import React from 'react';
import { PageProps } from 'gatsby';
import { useStaticQuery, graphql } from 'gatsby';
import { useInView } from 'react-intersection-observer';

import MarketingLayout from '@/components/common/hl-marketing-layout';
import FormSection from '@/components/hallucination-index-2023/form-section';
import SEO from '@/components/seo';

import Magnifier from '@/assets/icon-magnifier.svg';
import Bubble from '@/assets/icon-bubble.svg';
import Inspect from '@/assets/icon-inspect.svg';
import Check from '@/assets/icon-check.svg';
import { PerformanceTabs } from '@/components/hallucination-index-2023/tabs';

import Logo from '@/assets/logo-pure-white.svg';

import { ChainPollTable } from '@/components/common/scroll-wrap';

import { metricItems } from '@/utils/hallucination-index-data';
import Prefooter from '@/components/hallucination-index-2023/prefooter';
import { smoothScrollTo } from '../../components/common/smooth-scroll';

/** A single headline figure shown in the stat strip at the bottom of a card. */
interface AboutIndexStat {
  name: string;
  value: string;
}

/**
 * One card of the "About the index" grid.
 *
 * `icon` and `text` are raw HTML strings that the page injects with
 * `dangerouslySetInnerHTML`, so they must only ever contain trusted,
 * hand-authored markup (note the HTML-style `fill-rule` / `clip-rule`
 * attribute names rather than JSX camelCase).
 */
interface AboutIndexSection {
  title: string;
  icon: string;
  text: string;
  items?: AboutIndexStat[];
}

/**
 * Content for the Why / What / How cards.
 *
 * Every external link in `text` opens in a new tab and carries
 * `rel="noopener noreferrer"` so the opened page gets no `window.opener`
 * handle back to this one.
 */
const aboutIndex: AboutIndexSection[] = [
  {
    title: 'Why',
    icon: `<svg width="30" height="30" viewBox="0 0 30 30" fill="none" xmlns="http://www.w3.org/2000/svg">
    <path fill-rule="evenodd" clip-rule="evenodd" d="M6 5C6 4.44772 6.44772 4 7 4H18.1716C18.4368 4 18.6911 4.10536 18.8787 4.2929L23.7072 9.12132C23.8946 9.30886 24 9.56322 24 9.82842V25C24 25.5522 23.5522 26 23 26H7C6.44772 26 6 25.5522 6 25V5ZM7 2C5.34314 2 4 3.34314 4 5V25C4 26.6568 5.34314 28 7 28H23C24.6568 28 26 26.6568 26 25V9.82842C26 9.03278 25.684 8.26972 25.1214 7.7071L20.2928 2.87868C19.7303 2.31608 18.9672 2 18.1716 2H7ZM9 8C8.44772 8 8 8.44772 8 9C8 9.55228 8.44772 10 9 10H15C15.5523 10 16 9.55228 16 9C16 8.44772 15.5523 8 15 8H9ZM9 14C8.44772 14 8 14.4477 8 15C8 15.5523 8.44772 16 9 16H21C21.5522 16 22 15.5523 22 15C22 14.4477 21.5522 14 21 14H9ZM9 20C8.44772 20 8 20.4478 8 21C8 21.5522 8.44772 22 9 22H21C21.5522 22 22 21.5522 22 21C22 20.4478 21.5522 20 21 20H9Z" fill="black"/>
    </svg>
    `,
    text: `There has yet to be an LLM benchmark report that
    provides a comprehensive measurement of LLM hallucinations. After
    all, measuring hallucinations is difficult, as LLM performance
    varies by task type, dataset, context and more. Further, there
    isn’t a consistent set of metrics for measuring hallucinations.`,
  },
  {
    title: 'What',
    icon: `<svg width="30" height="30" viewBox="0 0 30 30" fill="none" xmlns="http://www.w3.org/2000/svg">
    <g clip-path="url(#clip0_286_720)">
    <path fill-rule="evenodd" clip-rule="evenodd" d="M1 0C0.447716 0 0 0.447716 0 1V29C0 29.5522 0.447716 30 1 30H9C9.55228 30 10 29.5522 10 29V10H29C29.5522 10 30 9.55228 30 9V1C30 0.447716 29.5522 0 29 0H1ZM2 8.1501V2H8.15004L8.1501 8.15014L2 8.1501ZM1.99999 9.8501L2 28H8V25.85H5.50004C5.0306 25.85 4.65004 25.4696 4.65004 25C4.65004 24.5306 5.0306 24.15 5.50004 24.15H8V21.85H4.50004C4.0306 21.85 3.65004 21.4694 3.65004 21C3.65004 20.5306 4.0306 20.15 4.50004 20.15H8V17.8501L5.50004 17.85C5.0306 17.85 4.65004 17.4695 4.65004 17C4.65006 16.5306 5.03062 16.15 5.50006 16.15L8 16.1501V13.85L5.49998 13.85C5.03054 13.85 4.65 13.4694 4.65 13C4.65 12.5306 5.03058 12.15 5.50002 12.15L8 12.15V9.85014L1.99999 9.8501ZM9.8501 8H12.15V5.50012C12.15 5.03068 12.5306 4.65012 13 4.65012C13.4694 4.65012 13.85 5.03068 13.85 5.50012V8H16.15V5.5001C16.15 5.03066 16.5306 4.6501 17 4.6501C17.4695 4.6501 17.85 5.03066 17.85 5.5001V8H20.1502V4.5001C20.1502 4.03066 20.5306 3.6501 21.0002 3.6501C21.4696 3.6501 21.8502 4.03066 21.8502 4.5001V8H24.1502V5.5001C24.1502 5.03066 24.5308 4.6501 25.0002 4.6501C25.4696 4.6501 25.8502 5.03066 25.8502 5.5001V8H28V2H9.85004L9.8501 8Z" fill="black"/>
    </g>
    <defs>
    <clipPath id="clip0_286_720">
    <rect width="30" height="30" fill="white"/>
    </clipPath>
    </defs>
    </svg>
    `,
    text: `The Hallucination Index ranks popular LLMs based on
    their propensity to hallucinate across three common task types -
    question & answer without RAG, question and answer with RAG, and
    long-form text generation.`,
  },
  {
    title: 'How',
    icon: `<svg width="30" height="30" viewBox="0 0 30 30" fill="none" xmlns="http://www.w3.org/2000/svg">
    <path fill-rule="evenodd" clip-rule="evenodd" d="M1.80005 15C1.80005 7.7099 7.7099 1.80005 15 1.80005C22.2902 1.80005 28.2 7.7099 28.2 15C28.2 22.2902 22.2902 28.2 15 28.2C7.7099 28.2 1.80005 22.2902 1.80005 15ZM15 3.60004C8.70402 3.60004 3.60004 8.70402 3.60004 15C3.60004 21.296 8.70402 26.4 15 26.4C21.296 26.4 26.4 21.296 26.4 15C26.4 8.70402 21.296 3.60004 15 3.60004ZM6.15008 15C6.15008 10.1123 10.1124 6.15004 15.0001 6.15004C19.8878 6.15004 23.85 10.1123 23.85 15C23.85 19.8877 19.8878 23.85 15.0001 23.85C10.1124 23.85 6.15008 19.8877 6.15008 15ZM15.0001 7.85004C11.0512 7.85004 7.85008 11.0512 7.85008 15C7.85008 18.9488 11.0513 22.15 15.0001 22.15C18.9489 22.15 22.15 18.9488 22.15 15C22.15 11.0512 18.9489 7.85004 15.0001 7.85004ZM15.0001 10.5C12.5148 10.5 10.5001 12.5148 10.5001 15C10.5001 17.4853 12.5148 19.5 15.0001 19.5C17.4853 19.5 19.5001 17.4853 19.5001 15C19.5001 12.5148 17.4853 10.5 15.0001 10.5ZM12.1001 15C12.1001 13.3984 13.3985 12.1 15.0001 12.1C16.6017 12.1 17.9001 13.3984 17.9001 15C17.9001 16.6017 16.6017 17.9 15.0001 17.9C13.3985 17.9 12.1001 16.6017 12.1001 15Z" fill="black"/>
    </svg>
    `,
    text: `The Index ranks 11 leading LLMs performance across
    three task types. The LLMs were evaluated using seven popular
    datasets. To measure hallucinations, the Hallucination Index
    employs two metrics,
    <a
      target="_blank"
      rel="noopener noreferrer"
      href="https://docs.galileo.ai/galileo/how-to-and-faq/ml-research-algorithms/guardrail-metrics/factuality"
    >
      Correctness
    </a>
    and
    <a
      target="_blank"
      rel="noopener noreferrer"
      href="https://docs.galileo.ai/galileo/how-to-and-faq/ml-research-algorithms/guardrail-metrics/groundedness"
    >
      Context Adherence</a>, which are built with the state-of-the-art evaluation method
    <a
      target="_blank"
      rel="noopener noreferrer"
      href="https://www.galileo.ai/blog/chainpoll"
    >
      ChainPoll</a>.`,
    items: [
      {
        name: 'Rows of text',
        value: '20k+',
      },
      {
        name: 'Popular LLMs',
        value: '11',
      },
      {
        name: 'Task Types',
        value: '3',
      },
    ],
  },
];

/** Pairs a feature label with the icon element displayed next to it. */
const formFeature = (name: string, icon: React.ReactElement) => ({
  name,
  icon,
});

/**
 * Product feature bullets (label + icon).
 *
 * NOTE(review): nothing in the visible part of this file reads this list —
 * confirm whether it is still needed.
 */
const formList = [
  formFeature('Detect model hallucinations', <Magnifier />),
  formFeature('Find the best prompt', <Bubble />),
  formFeature('Inspect data errors while fine-tuning', <Inspect />),
  formFeature('Works with popular tools and models', <Check />),
];

/**
 * Body of the 2023 LLM Hallucination Index landing page.
 *
 * Renders, top to bottom: the letter-grid hero with the "Get The Full Report"
 * call-to-action, the welcome / "Why another benchmark?" copy, the
 * "About the index" cards built from `aboutIndex`, the per-task ranking tabs
 * fed by the static query below, the evaluation-methodology section with the
 * ChainPoll metric table, and finally the report form and prefooter.
 */
const LlmHero = () => {
  // The query text is left as-is; the `joined` image it selects is not
  // rendered by this component.
  const query = useStaticQuery(graphql`
    {
      joined: file(relativePath: { eq: "joined.png" }) {
        childImageSharp {
          gatsbyImageData(
            layout: CONSTRAINED
            quality: 100
            width: 102
            formats: [AUTO, WEBP, AVIF]
          )
        }
      }

      taskPerformance: allTaskperformanceCsv {
        edges {
          node {
            Model
            Long_form_text_generation
            QA_with_RAG
            QA_without_RAG
          }
        }
      }

      taskInsights: allTaskinsightCsv {
        edges {
          node {
            Insight
            Model
            Task
          }
        }
      }
    }
  `);

  const { taskPerformance, taskInsights } = query;

  // PerformanceTabs receives the bare rows rather than the GraphQL edge wrappers.
  const chartData = taskPerformance.edges.map((edge) => edge.node);

  const scrollToReport = () => {
    smoothScrollTo('report');
  };

  // The call-to-action is an anchor without an href, which browsers neither
  // focus nor activate from the keyboard on their own; mirror native button
  // behaviour for Enter and Space.
  const handleReportKeyDown = (
    event: React.KeyboardEvent<HTMLAnchorElement>
  ) => {
    if (event.key === 'Enter' || event.key === ' ') {
      event.preventDefault();
      scrollToReport();
    }
  };

  return (
    <section className="relative">
      {/* header */}

      <div className="h-screen max-h-[720px] bg-hi-60 pt-16 lg:max-h-[1024px] lg:pt-20">
        <div className="relative mx-auto flex h-full w-full max-w-[1280px] flex-col justify-between pb-14 text-white lg:pb-16">
          <div className="flex grow flex-col items-center justify-center px-4 py-10">
            {/* Decorative letter grid: hidden from assistive technology so it
                is not announced one character at a time; the <h1> below
                carries the page title. */}
            <div className="hallucination w-full" aria-hidden="true">
              <div className="hal-row">
                <span className="rotate1 time1">M</span>
                <span className="rotate2 time2">E</span>
                <span className="rotate3 time3">T</span>
                <span className="rotate4 time4">R</span>
                <span className="rotate5 time5">I</span>
                <span className="rotate6 time6">C</span>
                <span className="rotate7 time7">P</span>
                <span className="rotate8 time8">R</span>
                <span className="rotate9 time9">O</span>
                <span className="rotate10 time10">M</span>
                <span className="rotate2 time2">P</span>
                <span className="rotate3 time3">T</span>
                <span className="rotate4 time5">D</span>
                <span className="rotate5 time2">A</span>
                <span className="rotate6 time10">T</span>
                <span className="rotate8 time8">A</span>
                <span className="rotate1 time9">S</span>
                <span className="rotate3 time7">E</span>
                <span className="rotate7 time6">T</span>
                <span className="rotate2 time5">M</span>
                <span className="rotate3 time4">O</span>
                <span className="rotate8 time2">D</span>
                <span className="rotate9 time3">E</span>
                <span className="rotate10 time1">L</span>
              </div>

              <div className="hal-row">
                <span className="rotate1 time5">E</span>
                <span className="rotate3 time2">X</span>
                <span className="rotate5 time10">P</span>
                <span className="rotate7 time9">E</span>
                <span className="rotate9 time8">R</span>
                <span className="rotate10 time7">I</span>
                <span className="rotate2 time5">M</span>
                <span className="rotate4 time6">E</span>
                <span className="rotate6 time4">N</span>
                <span className="rotate8 time3">T</span>
                <span className="rotate10 time2">M</span>
                <span className="rotate9 time1">E</span>
                <span className="rotate8 time2">T</span>
                <span className="rotate7 time3">H</span>
                <span className="rotate6 time6">O</span>
                <span className="rotate5 time4">D</span>
                <span className="rotate4 time8">I</span>
                <span className="rotate3 time9">N</span>
                <span className="rotate2 time10">S</span>
                <span className="rotate1 time4">I</span>
                <span className="rotate7 time5">G</span>
                <span className="rotate5 time3">H</span>
                <span className="rotate6 time2">T</span>
                <span className="rotate2 time1">S</span>
              </div>

              <div className="hal-row">
                <span className="rotate7 time5">D</span>
                <span className="rotate6 time4">E</span>
                <span className="rotate5 time7">S</span>
                <span className="rotate4 time6">I</span>
                <span className="rotate3 time9">G</span>
                <span className="rotate2 time10">N</span>
                <span className="rotate1 time3">R</span>
                <span className="rotate10 time1">A</span>
                <span className="rotate8 time7">G</span>
                <span className="rotate5 time9">L</span>
                <span className="rotate6 time2">L</span>
                <span className="rotate3 time3">M</span>
                <span className="rotate2 time4">E</span>
                <span className="rotate4 time5">X</span>
                <span className="rotate5 time7">A</span>
                <span className="rotate7 time5">M</span>
                <span className="rotate9 time6">P</span>
                <span className="rotate6 time8">L</span>
                <span className="rotate3 time9">E</span>
                <span className="rotate2 time7">H</span>
                <span className="rotate1 time1">U</span>
                <span className="rotate4 time2">M</span>
                <span className="rotate7 time3">A</span>
                <span className="rotate8 time2">N</span>
              </div>

              <div className="hal-row">
                <span className="rotate5 time5">E</span>
                <span className="rotate3 time6">L</span>
                <span className="rotate10 time2">A</span>
                <span className="rotate5 time3">G</span>
                <span className="rotate8 time2">U</span>
                <span className="rotate7 time10">E</span>
                <span className="rotate6 time8">N</span>
                <span className="rotate4 time3">R</span>
                <span className="rotate1 time7">O</span>
                <span className="rotate3 time1">N</span>
                <span className="rotate2 time2">&</span>
                <span className="rotate5 time3">P</span>
                <span className="rotate6 time1">A</span>
                <span className="rotate9 time5">R</span>
                <span className="rotate5 time3">K</span>
                <span className="rotate6 time10">I</span>
                <span className="rotate7 time9">N</span>
                <span className="rotate9 time8">G</span>
                <span className="rotate8 time6">F</span>
                <span className="rotate2 time4">A</span>
                <span className="rotate3 time3">L</span>
                <span className="rotate4 time1">C</span>
                <span className="rotate2 time2">O</span>
                <span className="rotate2 time5">N</span>
              </div>

              <div className="hal-row">
                <span className="rotate5 time7">C</span>
                <span className="rotate6 time5">A</span>
                <span className="rotate7 time6">T</span>
                <span className="rotate8 time1">N</span>
                <span className="rotate9 time3">N</span>
                <span className="rotate2 time4">R</span>
                <span className="rotate3 time9">L</span>
                <span className="rotate2 time8">U</span>
                <span className="rotate1 time7">C</span>
                <span className="rotate10 time6">Y</span>
                <span className="rotate9 time6">N</span>
                <span className="rotate5 time5">O</span>
                <span className="rotate3 time7">T</span>
                <span className="rotate5 time9">I</span>
                <span className="rotate7 time3">O</span>
                <span className="rotate6 time2">N</span>
                <span className="rotate2 time1">I</span>
                <span className="rotate1 time7">N</span>
                <span className="rotate7 time6">D</span>
                <span className="rotate9 time7">E</span>
                <span className="rotate6 time8">X</span>
                <span className="rotate8 time4">G</span>
                <span className="rotate1 time10">P</span>
                <span className="rotate3 time1">T</span>
              </div>
            </div>
          </div>
          <div className="relative z-10 flex flex-wrap justify-end lg:flex-row lg:items-end lg:justify-between">
            <div className="w-full px-4">
              <h1 className="text-[40px] font-normal leading-none sm:text-[48px] md:text-[56px] lg:text-[64px] xl:text-[72px]">
                LLM Hallucination Index
              </h1>
              <p className="my-4 text-[18px] md:text-[20px] lg:text-[24px] xl:text-[28px]">
                A Ranking & Evaluation Framework For LLM Hallucinations
              </p>
              <p className="mt-4 lg:mt-8">
                <a
                  role="button"
                  tabIndex={0}
                  onClick={scrollToReport}
                  onKeyDown={handleReportKeyDown}
                  className="inline-block cursor-pointer bg-hi-100 px-10 py-3 text-center font-medium text-white duration-200 hover:bg-hi-20 lg:px-14 lg:py-4"
                >
                  Get The Full Report
                </a>
              </p>
            </div>
            <div className="mt-8 flex w-full flex-row items-center justify-start gap-2 px-4 sm:justify-end">
              <p className="whitespace-nowrap text-xs opacity-70 lg:text-sm">
                Brought to you by
              </p>
              <div className="w-full max-w-[80px] text-white opacity-70 lg:max-w-[120px]">
                <Logo />
              </div>
            </div>
          </div>
        </div>
      </div>

      <section className="section-wrapper py-20 sm:py-24 md:py-28 xl:py-32">
        <div className="mx-auto h-full max-w-[1280px] space-y-8 px-4">
          <h2 className="font-serif text-[32px] font-normal md:text-[36px] lg:text-[48px]">
            👋 Welcome to the Hallucination Index!
          </h2>

          <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
            Many enterprise teams have already successfully deployed LLMs in
            production, and many others have committed to deploying Generative
            AI products in 2024. However, for enterprise AI teams, the biggest
            hurdle to deploying production-ready Generative AI products remains
            the fear of model hallucinations – a catch-all phrase for when the
            model generates text that is incorrect or fabricated. There can be
            several reasons for this, such as a lack of the model’s capacity to
            memorize all of the information it was fed, training data errors,
            and outdated training data.
          </p>

          <h3 className="font-serif text-[24px] font-normal md:text-[28px] lg:text-[32px] xl:text-[36px]">
            Why another benchmark?
          </h3>

          <p className="leading-loose">
            There are a few LLM benchmarks today. While these benchmarks do much
            to advance the adoption of LLMs, they have a few critical
            blindspots.
          </p>

          <ul className="list-disc pl-4">
            <li className="leading-loose">
              <b>Not focused on LLM output quality</b>: Existing benchmarks
              provide a generic evaluation of LLM attributes and performance,
              and not a focused evaluation of the quality of the LLMs output
              (hallucination likelihood). As a result, these benchmarks do not
              leverage metrics that measure the actual quality of LLM outputs –
              one of the top concerns for enterprise GenAI teams today.
            </li>
            <li className="leading-loose">
              <b>Not focused on task type</b>: A practical benchmark useful for
              Enterprise genAI teams needs to cater to the variability in task
              types. For instance, a model that works well for chat, might not
              be great at text summarization.
            </li>
            <li className="leading-loose">
              <b>Not focused on the power of context</b>: Retrieval augmented
              generation (RAG) is a popular technique across teams to provide
              LLMs with useful context. LLM benchmarks today ignore how they
              perform with context – granted there is nuance here with regards
              to the quality of the context, but measuring variability in LLM
              performance across RAG vs non-RAG tasks is critical.
            </li>
          </ul>

          <p className="leading-loose">
            The Hallucination Index offers a structured approach to assess and
            measure hallucinations as an endeavor to help teams build more
            trustworthy GenAI applications.
          </p>

          <h3 className="font-serif text-[24px] font-normal md:text-[28px] lg:text-[32px] xl:text-[36px]">
            About the index
          </h3>

          <div className="mt-12">
            <div className="grid grid-cols-1 gap-4 md:gap-4 lg:grid-cols-3 lg:gap-5">
              {aboutIndex.map((section) => (
                <div className="border border-hi-3" key={section.title}>
                  <div className="flex flex-row items-center space-x-2 border-b border-hi-3 bg-hi-2 p-4 md:p-4 lg:p-5 xl:p-6">
                    <div>
                      {/* icon/text are trusted, hand-authored HTML from aboutIndex */}
                      <div dangerouslySetInnerHTML={{ __html: section.icon }} />
                    </div>
                    <p className="text-[18px] md:text-[20px] lg:text-[24px]">
                      {section.title}
                    </p>
                  </div>

                  <div className="p-4 md:p-4 lg:p-5 xl:p-6">
                    <p
                      dangerouslySetInnerHTML={{ __html: section.text }}
                      className="text-base font-light leading-loose text-dark-80 lg:max-w-sm"
                    />

                    {section.items && section.items.length > 0 && (
                      <div className="mt-4 flex flex-row -space-x-px">
                        {section.items.map((stat) => (
                          <div
                            className="w-40 border border-hi-3 p-2 lg:p-3"
                            key={stat.name}
                          >
                            <p className="mb-3 text-base text-sm font-bold text-dark-80 md:text-base lg:text-xl">
                              {stat.value}
                            </p>
                            <p className="text-sm font-light text-dark-80">
                              {stat.name}
                            </p>
                          </div>
                        ))}
                      </div>
                    )}
                  </div>
                </div>
              ))}
            </div>
          </div>
          <p className="leading-loose">
            To learn more about our Methodology,{' '}
            <a href="/hallucinationindex-2023/methodology">click here</a>.
          </p>
        </div>
      </section>

      <div className="mx-auto mb-6 h-full max-w-[1280px] px-4">
        <p className="text-[16px] md:text-[18px] lg:text-[20px] xl:text-[22px]">
          Hallucination Index
        </p>
        <h3 className="font-serif text-[28px] font-normal md:text-[36px] lg:text-[40px] xl:text-[48px]">
          LLM Rankings by Task Type
        </h3>
      </div>
      <PerformanceTabs
        performanceData={chartData}
        taskInsights={taskInsights}
      />

      <section className="-mb-16 bg-white py-20 sm:py-24 md:py-28 xl:py-32">
        <div className="mx-auto h-full max-w-[1280px] px-4">
          <h2 className="font-serif text-[28px] font-normal md:text-[36px] lg:text-[40px] xl:text-[48px]">
            Evaluation Methodology
          </h2>
          <hr className="my-2 opacity-0" />
          <p className="mb-3 leading-loose">
            <b>Dataset</b> <br />
            We use standard datasets and create appropriate prompts for each
            model.
          </p>
          <p className="mb-3 leading-loose">
            <b>Generation</b>
            <br /> Generations are done with similar, model specific prompts,
            without CoT(Chain of thought), and using the same text generation
            configurations (i.e. hyper-parameters).
          </p>

          <p className="mb-3 leading-loose">
            <b>Evaluation</b> <br /> We use an LLM-based evaluation for
            scalability, both in cost and time. Specifically, we use the state
            of the art ChainPoll metric to evaluate propensity for
            hallucination.
          </p>
          <p className="mb-3 leading-loose">
            <b>ChainPoll Efficacy</b> <br />
            We leverage extensive human annotation to confirm the reliability of
            the ChainPoll metric for each task type.
          </p>
          <p className="mb-3 leading-loose">
            <b>Task score</b> <br />
            The final score is calculated as the mean of dataset scores for the
            task. The dataset score is the mean of ChainPoll score for each
            sample in the dataset. We emphasize that this score is an LLM based
            score and not a human evaluation score.
          </p>

          <div className="mt-8">
            <p className="mb-2 font-bold">ChainPoll</p>
            <p className="mb-4 leading-loose">
              ChainPoll, developed by Galileo Labs, is an innovative and
              cost-effective hallucination detection method for large language
              models (LLMs), and RealHall is a set of challenging, real-world
              benchmark datasets. Our extensive comparisons show ChainPoll's
              superior performance in detecting LLM hallucinations,
              outperforming existing metrics with a significant margin in
              accuracy, transparency, and efficiency, while also introducing
              new metrics for evaluating LLMs' adherence and correctness in
              complex reasoning tasks.
            </p>

            <p className="text-sm lg:text-base">
              <a
                href="/hallucinationindex-2023/methodology"
                className="group flex items-center font-medium text-hi-100"
              >
                Learn More
              </a>
            </p>

            <hr className="my-4 opacity-0" />

            <div className="border border-hi-3 p-4 md:p-5 lg:p-6 xl:p-8">
              <ChainPollTable
                theaderOne={'Metric'}
                theaderTwo={'Aggregate AUROC'}
                items={metricItems}
              />
            </div>
          </div>
        </div>
      </section>

      <FormSection />

      <Prefooter />
    </section>
  );
};

/**
 * Page component for the 2023 Hallucination Index: wraps the `LlmHero`
 * content in the marketing layout configured for the 2023 edition.
 */
const HallucinationIndexPage: React.FC<PageProps> = () => (
  <MarketingLayout year={2023}>
    <LlmHero />
  </MarketingLayout>
);

export default HallucinationIndexPage;

/**
 * Named `Head` export: supplies the page title, description and the
 * 1200x670 social-share image through the shared `SEO` component.
 */
export const Head = () => {
  const title = 'LLM Hallucination Index 2023';
  const description =
    'LLM Hallucination Index 2023. A Ranking & Evaluation Framework For LLM Hallucinations';

  return (
    <SEO
      title={title}
      description={description}
      image="/hallucination-index-og.png"
      imageWidth={1200}
      imageHeight={670}
    />
  );
};
