import React, { useState, useRef, useEffect } from 'react';
import { PageProps } from 'gatsby';
import { useStaticQuery, graphql } from 'gatsby';
import { useInView } from 'react-intersection-observer';
import { GatsbyImage } from 'gatsby-plugin-image';

import PreFooter from '@/components/hallucination-index-2024/pre-footer';
import WavyBackground from '@/components/hallucination-index-2024/methodology/wave-background';
import MethodologyHowDoesItWork from '@/assets/methodology-how-does-it-work.svg';

import MarketingLayout from '@/components/common/hl-marketing-layout';
import SEO from '@/components/seo';

import { ChainPollTable2024 } from '@/components/common/scroll-wrap';

/**
 * Renders a fixed, preformatted snippet of generation settings
 * (token limit, temperature, top_p, sampling flag, seed, best_of).
 *
 * The snippet is display text only: it is never parsed, which is why it can
 * keep the Python-style `True` and the trailing comma exactly as written.
 */
const CodeBlock = () => {
  // Kept as one template literal so the line breaks and indentation inside
  // the <pre> are rendered verbatim.
  const generationSettings = `{
  "max_new_tokens": 1024,
  "temperature":0.1,
  "top_p":0.9,
  "do_sample":True,
  "seed": 0,
  "best_of": 1,
}`;

  return (
    <pre className="mt-4 overflow-auto border border-hi-3 bg-hi-2 p-4">
      <code>{generationSettings}</code>
    </pre>
  );
};

import {
  metricItems,
  modelsByCreators,
} from '@/utils/hallucination-index-data';

/**
 * Source rows for the "about the Index" copy, written as `[title, text]`
 * pairs so each entry reads as a heading followed by its paragraph.
 */
const aboutIndexPairs: Array<[string, string]> = [
  [
    'Context',
    'While there is no shortage of benchmark indices in the market, very few measure hallucinations. Measuring hallucinations is difficult, as LLM performance varies based on use-case, dataset, and more. Furthermore, there’s no consistent framework and set of metrics for measuring and evaluating hallucinations. ',
  ],
  [
    'Design',
    'Hallucination Index is built to measure the efficacy of LLMs across 3 common use-cases - Q&A without context, Q&A with context, and long-form text generation. It measures the hallucination of models in free form text.',
  ],
  [
    'Execution',
    'The Index ranks 11 leading LLMs based on their propensity to hallucinate for each use cases. We select 7 standard datasets to create tasks which match the requirement in the industry. LLMs are evaluated using 2 metrics, Correctness and Context Adherence which are built with state of the art technique ChainPoll.',
  ],
];

/**
 * The same rows as `{ title, text }` objects, in the order Context, Design,
 * Execution.
 */
const aboutIndex = aboutIndexPairs.map(([title, text]) => ({ title, text }));

/**
 * Rows of a three-column comparison table: `title` is the row label and
 * `td1`..`td3` are the cell contents for the three task types.
 *
 * The "Datasets" row stores raw HTML strings rather than JSX.
 * NOTE(review): presumably these are injected with dangerouslySetInnerHTML
 * by the consumer — confirm before changing the markup format.
 *
 * Every link opens in a new tab, so each anchor carries
 * `rel="noopener noreferrer"` to keep the opened page from reaching back via
 * `window.opener`, matching the JSX anchors elsewhere in this file.
 */
const dataTableItems = [
  {
    title: 'Definition',
    td1: 'Generates short answer for a question without context',
    td2: 'Generates an answer for a question with context',
    td3: 'Generates long answer based on a prompt',
  },

  {
    title: 'Datasets',
    td1: `
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/truthful_qa">TruthfulQA</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/trivia_qa">TriviaQA</a>
    `,
    td2: `
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/narrativeqa">NarrativeQA</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/drop">DROP</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/ms_marco">MS Marco</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/hotpot_qa/viewer/distractor/validation">HotpotQA distractor test</a>
    `,
    td3: `<a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/OpenAssistant/oasst1">Open Assistant</a>`,
  },
];

/**
 * Props for {@link SectionWrapper}. All three are rendered directly as JSX
 * children, so anything React can render is accepted.
 */
type SectionWrapperProps = {
  /** Content shown inside the small bordered badge (callers pass the step number). */
  num: React.ReactNode;
  /** Content of the section's `<h3>` heading. */
  title: React.ReactNode;
  /** Body of the section, rendered below the header row. */
  children?: React.ReactNode;
};

/**
 * Bordered page section made of a header row (badge + heading) and a body
 * row holding `children`.
 */
const SectionWrapper = ({ children, num, title }: SectionWrapperProps) => {
  return (
    <div className="section-wrapper overflow-hidden border border-gray-400/40 py-8 sm:py-10 md:py-12 lg:py-14 xl:py-16">
      {/* Header row: badge on the left, heading filling the rest. */}
      <div className="top-14 z-10 flex flex-row items-center bg-white py-3 lg:top-20">
        <div className="w-auto shrink px-4">
          <div className="">
            <div className="flex h-9 w-9 items-center justify-center rounded-lg border border-hi-3 text-[20px] font-bold">
              {num}
            </div>
          </div>
        </div>
        <div className="w-auto w-full grow">
          <h3 className="text-[18px] font-semibold md:text-[24px] lg:text-[28px] xl:text-[32px]">
            {title}
          </h3>
        </div>
      </div>
      {/* Body row. */}
      <div className="-mx-4 flex flex-col bg-white md:flex-row md:flex-nowrap">
        {/* Empty spacer, shown from the md breakpoint up; presumably it
            indents the body to line up with the heading — confirm visually. */}
        <div className="hidden w-auto shrink px-4 md:block">
          <div className="">
            <div className="h-10 w-10 lg:text-[28px]"></div>
          </div>
        </div>
        <div className="w-auto w-full grow px-8 lg:px-4">
          {/* Invisible rule used only for its vertical margin. */}
          <hr className="my-2 opacity-0" />
          {children}
        </div>
      </div>
    </div>
  );
};

/**
 * Dataset reference lists shown by {@link ShowMoreSection}, grouped by task
 * type. Each dataset has the link label, the link target, and the sentence
 * rendered after the link (following a ": " separator).
 */
const SHOW_MORE_DATASET_GROUPS = [
  {
    heading: 'Q&A without RAG',
    datasets: [
      {
        name: 'TruthfulQA',
        href: 'https://arxiv.org/abs/2109.07958',
        description:
          'A benchmark to measure biases of large language models using question answering task.',
      },
      {
        name: 'TriviaQA',
        href: 'https://nlp.cs.washington.edu/triviaqa/',
        description:
          'A reading comprehension dataset containing question-answer-evidence triples.',
      },
    ],
  },
  {
    heading: 'Q&A with RAG',
    datasets: [
      {
        name: 'NarrativeQA',
        href: 'https://huggingface.co/datasets/narrativeqa',
        description:
          'A dataset of stories and corresponding questions designed to test reading comprehension, especially on long documents.',
      },
      {
        name: 'DROP',
        href: 'https://huggingface.co/datasets/mrqa',
        description:
          'Reading comprehension benchmark which requires Discrete Reasoning Over the content of Paragraphs. Answering requires resolving references in a question, perhaps to multiple input positions, and performing discrete operations over them (such as addition, counting, or sorting).',
      },
      {
        // Spelling corrected from "MS Macro"; the dataset is MS MARCO.
        name: 'Microsoft MS MARCO',
        href: 'https://microsoft.github.io/msmarco/',
        description:
          'A dataset containing queries and paragraphs with relevance labels.',
      },
      {
        name: 'HotpotQA',
        href: 'https://huggingface.co/datasets/hotpot_qa',
        description:
          'A dataset with Wikipedia-based question-answer pairs that require finding and reasoning over multiple supporting documents to answer.',
      },
    ],
  },
  {
    heading: 'Long-form text Generation',
    datasets: [
      {
        name: 'OpenAssistant',
        href: 'https://github.com/LAION-AI/Open-Assistant',
        description:
          'A human-generated, human-annotated assistant-style conversation corpus. It covers factual questions on varied domains.',
      },
    ],
  },
];

/**
 * Renders one titled bullet list per task type, where each bullet is an
 * external dataset link followed by a one-sentence description.
 *
 * Links open in a new tab and carry `rel="noopener noreferrer"` so the opened
 * page cannot reach this one through `window.opener`.
 */
const ShowMoreSection = () => {
  return (
    <div className="space-y-6 lg:space-y-10">
      {SHOW_MORE_DATASET_GROUPS.map((group) => (
        <div key={group.heading}>
          <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
            {group.heading}
          </p>
          <hr className="mb-4 mt-2 border-t border-hi-3 xl:mb-6 xl:mt-4" />
          <ul className="list-disc space-y-4 pl-8">
            {group.datasets.map((dataset) => (
              // hrefs are unique across the table, so they serve as keys.
              <li key={dataset.href}>
                <a
                  className="text-hi-100 hover:underline"
                  target="_blank"
                  rel="noopener noreferrer"
                  href={dataset.href}
                >
                  {dataset.name}
                </a>
                : {dataset.description}
              </li>
            ))}
          </ul>
        </div>
      ))}
    </div>
  );
};

const LlmHero = () => {
  const query = useStaticQuery(graphql`
    {
      taskPerformance: allTaskperformanceCsv {
        edges {
          node {
            Model
            Long_form_text_generation
            QA_with_RAG
            QA_without_RAG
          }
        }
      }
    }
  `);

  const refSlider = useRef(null);

  // add scroll left and right functionality when dragging the refSlider

  useEffect(() => {
    if (refSlider.current) {
      const slider = refSlider.current;
      let isDown = false;
      let startX;
      let scrollLeft;

      slider.addEventListener('mousedown', (e) => {
        isDown = true;
        slider.classList.add('active');
        startX = e.pageX - slider.offsetLeft;
        scrollLeft = slider.scrollLeft;
      });

      slider.addEventListener('mouseleave', () => {
        isDown = false;
        slider.classList.remove('active');
      });

      slider.addEventListener('mouseup', () => {
        isDown = false;
        slider.classList.remove('active');
      });

      slider.addEventListener('mousemove', (e) => {
        if (!isDown) return;
        e.preventDefault();
        const x = e.pageX - slider.offsetLeft;
        const walk = (x - startX) * 1; //scroll-fast
        slider.scrollLeft = scrollLeft - walk;
      });
    }
  }, []);

  const { joined, taskPerformance } = query;

  const chartData = taskPerformance.edges.map((item) => item.node);

  const { ref, inView } = useInView({
    triggerOnce: false,
    rootMargin: '100px 0px',
  });

  return (
    <section className="relative pt-14 lg:pt-20">
      {/* header */}

      <section className="w-full bg-[#070707]">
        <div className="mx-auto max-w-[90rem] lg:px-[85px]">
          <div className="flex w-full flex-col p-0 md:flex-row lg:flex-row">
            <div className="flex items-center border border-gray-400/40 px-[14px] py-4 md:w-3/5 lg:w-[43%] lg:p-3">
              <h2 className="break-all font-inter text-[40px] font-bold leading-normal text-white md:text-[52px] lg:text-[72px] lg:leading-[80px]">
                Methodology
              </h2>
            </div>
            <WavyBackground
              waveWidth={250}
              speed="normal"
              className="inset-0 w-full p-0"
            >
              <div className="grid h-full grid-cols-12">
                {new Array(36).fill(0).map((_, i) => (
                  <div
                    key={i}
                    className="col-span-1 h-14 border border-gray-400/40 bg-transparent"
                  />
                ))}
              </div>
            </WavyBackground>
          </div>
          <p className="border-x border-gray-400/40 p-4 text-[18px] leading-loose text-white sm:text-[20px] lg:px-4 lg:py-14 lg:text-[22px]">
            The Hallucination Index is an ongoing initiative to evaluate and
            rank the largest and most popular LLMs on their propensity to
            hallucinate across common task types. The models were evaluated
            using a diverse set of datasets, chosen for their popularity and
            ability to challenge the models' abilities to stay on task. Below is
            the methodology used to create the Hallucination Index. Our RAG
            methodology is designed to rigorously evaluate RAG models across a
            variety of dimensions, ensuring both factual accuracy and contextual
            adherence.
          </p>
        </div>
      </section>

      <section className="">
        <div className="mx-auto h-full max-w-[90rem] px-4 lg:px-[85px]">
          <SectionWrapper num={1} title="Model Selection">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              The Hallucination Index evaluated the largest and most popular
              LLMs available today. These LLMs were chosen by surveying popular
              LLM repos, leaderboards, and industry surveys. The LLMs selected
              represent a combination of open-source and closed-source models of
              varying sizes. This domain is evolving, with new models being
              released weekly.
            </p>
            <p className="mt-4">
              The Hallucination Index will be updated every two quarters. To see
              an LLM added to the Index, contact us{' '}
              <a
                href="#newsLetterFormId-52a74452-6501-4d59-ad02-29f1202d6420"
                className="text-hi-100"
              >
                here.
              </a>
            </p>
            <hr className="my-4 opacity-0" />
            <div className="mt-4 hidden">
              <div className="relative -mr-4 overflow-hidden">
                {/* Horizontal drag-to-scroll is attached to this element through refSlider. */}
                <div
                  className="hide-scroll-bar relative cursor-grab overflow-auto active:cursor-grabbing"
                  ref={refSlider}
                >
                  <div className="flex flex-nowrap gap-4 pb-4">
                    {modelsByCreators.map((i, idx) => {
                      return (
                        <div className="flex-none basis-[248px] last:pr-4 md:basis-[256px] lg:basis-[268px]">
                          <div className="h-full border border-hi-3 bg-hi-2">
                            <div className="flex flex-row items-center space-x-2 border-b border-hi-3 bg-hi-2 p-4 lg:p-5">
                              <div>
                                <img
                                  width={24}
                                  height={24}
                                  src={`/creators/${i.creator}`}
                                />
                              </div>
                              <p className="">{i['creator-name']}</p>
                            </div>
                            <div className="p-4 lg:p-5">
                              <ul className="list-disc space-y-2.5 pl-4">
                                {i.models.map((m, idx) => {
                                  return (
                                    <li className="opacity-80">
                                      <a
                                        className="text-base font-normal leading-[175%] !text-black"
                                        href={`/hallucinationindex-2023/${m.Model.toLocaleLowerCase().replace(
                                          '.',
                                          '',
                                        )}`}
                                      >
                                        {m.Model}
                                      </a>
                                    </li>
                                  );
                                })}
                              </ul>
                            </div>
                          </div>
                        </div>
                      );
                    })}
                  </div>
                </div>
              </div>
            </div>
          </SectionWrapper>

          <SectionWrapper num={2} title="Task Type Selection">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Next, LLMs were tested across three common task types to observe
              their performance. We selected tasks relevant to developers and
              end-users and tested each LLM’s ability to operate with context of
              different lengths.
            </p>
            <hr className="my-4 opacity-0" />

            <p className="mb-4 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Why short, medium & long context RAG tasks?
            </p>
            <p className="mb-4 leading-relaxed">
              Context length affects the design of a RAG system by influencing
              retrieval strategies, computational resource needs, and the
              balance between precision and breadth. We conducted 3 experiments
              to gauge the state of LLMs’ performance in different contexts
              lengths.
            </p>
            <p className="mb-4 leading-relaxed">
              For short context lengths (less than 5,000 tokens), the pros are
              faster responses, better precision, and simplicity. However, they
              can miss out on broader context and might overfit to narrow
              scenarios. There is also a higher reliance on vector database
              precision to ensure relevant information retrieval.
            </p>
            <p className="mb-4 leading-relaxed">
              Medium context lengths (5,000 to 25,000 tokens) offer a balance
              between detail and scope, providing more nuanced answers. They
              rely less on the pinpoint accuracy of vector databases, as they
              have more room to include context. However, they come with
              increased complexity and higher resource usage.
            </p>
            <p className="leading-relaxed">
              Long context lengths (40,000 to 100,000 tokens) handle detailed
              queries well, offering rich information and comprehensive
              understanding. Since extensive context can be included, the
              reliance on vector database precision decreases even further. The
              downside is slower response times, high computational costs, and
              potential inclusion of irrelevant information.
            </p>

            <p className="mb-4 mt-8 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Short Context RAG
            </p>
            <p className="mb-4 leading-relaxed">
              The SCR evaluation utilizes a variety of demanding datasets to
              test the robustness of models in handling short contexts:
            </p>
            <p className="leading-relaxed">
              We employ Chainpoll with GPT-4o, which leverages the strong
              reasoning power of GPT series models. By using a chain of thought
              technique to poll the model multiple times, we can better judge
              the correctness of the responses. This not only provides a metric
              to quantify potential hallucinations but also offers explanations
              based on the provided context, a crucial feature for RAG systems.
            </p>

            <p className="mb-4 mt-8 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Medium and Long Context RAG
            </p>
            <p className="mb-4 leading-relaxed">
              Our methodology focuses on models' ability to comprehensively
              understand extensive texts in medium and long contexts. 
            </p>
            <p className="mb-8 leading-relaxed">
              We extract text from very recent 10k documents of a company,
              divide it into chunks, and designate one of these chunks as the
              needle chunk. Using these chunks, we construct the necessary
              dataset by varying the location of the needle. We create a
              retrieval question that can be answered using the needle. The LLM
              has to answer the question using the context containing the
              needle.
            </p>

            <p className="mb-4 leading-relaxed">
              Medium context lengths - 5k, 10k, 15k, 20k, 25k
            </p>

            <p className="mb-8 leading-relaxed">
              Long context lengths - 40k, 60k, 80k, 100k
            </p>

            <p className="mb-4 leading-relaxed">
              We designed the task with these considerations:
            </p>

            <ul className="list-disc space-y-1 pl-4">
              <li>All the text in context should be of single domain.</li>
              <li>
                Response should always be correct with short context to confirm
                the influence of long context.
              </li>
              <li>
                The question cannot be answered from memory of pre-training. It
                should not be a general old fact.
              </li>
              <li>
                Measuring the influence of position requires keeping the
                context, information, and question the same and altering only
                the location of the information.
              </li>
              <li>Avoid standard dataset as there can be test leakage.</li>
            </ul>

            <p className="mb-4 mt-8 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Evaluation
            </p>
            <p className="mb-4 leading-relaxed">
              Adherence to context is evaluated using a custom LLM-based
              assessment, checking for the relevant answer within the response.
            </p>
          </SectionWrapper>

          <SectionWrapper num={3} title="Dataset Selection">
            <p className="mb-4 mt-8 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Short Context Rag
            </p>
            <p className="mb-4 leading-relaxed">
              The Hallucination Index assesses LLM performance by leveraging 4
              popular and 2 proprietary datasets. The datasets effectively
              challenge each LLM's capabilities relevant to the task at hand.
              For this task we convert the query and document to form the input
              prompt with context.
            </p>

            <p className="mb-4 leading-relaxed">
              <a
                href="https://huggingface.co/datasets/mrqa-workshop/mrqa"
                target="_blank"
              >
                DROP:
              </a>{' '}
              Reading comprehension benchmark which requires Discrete Reasoning
              Over the content of Paragraphs. Answering requires resolving
              references in a question, perhaps to multiple input positions, and
              performing discrete operations over them (such as addition,
              counting, or sorting). 
            </p>
            <p className="mb-4 leading-relaxed">
              <a href="https://microsoft.github.io/msmarco/" target="_blank">
                Microsoft MS Macro:
              </a>{' '}
              A dataset containing queries and paragraphs with relevance labels.
            </p>
            <p className="mb-4 leading-relaxed">
              <a
                href="https://huggingface.co/datasets/hotpotqa/hotpot_qa"
                target="_blank"
              >
                HotpotQA:
              </a>{' '}
              A dataset with Wikipedia-based question-answer pairs that require
              finding and reasoning over multiple supporting documents to
              answer. 
            </p>
            <p className="mb-4 leading-relaxed">
              <a href="https://arxiv.org/abs/2210.03849" target="_blank">
                ConvFinQA:
              </a>{' '}
              A dataset to study the chain of numerical reasoning in
              conversational question answering. It poses great challenge in
              modeling long-range, complex numerical reasoning paths in
              real-world conversations.
            </p>

            <p className="mb-4 mt-8 text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Medium Context RAG
            </p>
            <p className="mb-4 leading-relaxed">
              We extract text from very recent 10k documents of a company,
              divide it into chunks, and designate one of these chunks as the
              needle chunk. Using these chunks, we construct the necessary
              dataset by varying the location of the needle. We keep the needle
              at 20 varying locations per each context length to test the
              performance.
            </p>
            <p className="mb-4 leading-relaxed">
              For the dataset with a context length of 10k, we will create 20
              samples, keeping the “info” at different positions in the
              context—0, 500, 1000, 1500, .., 9000, 9500.
            </p>
            <p className="mb-4 leading-relaxed">
              Similarly, for the dataset with context length of 100k, we will
              create 20 samples where we keep the “info” at different positions
              in the context - 0, 5000, 10000, 15000, .., 90000, 95000.
            </p>
          </SectionWrapper>

          <SectionWrapper num={4} title="Experimentation">
            <p className="mb-4 text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Once LLMs, Task Types, and Datasets selected, experimentation
              begins. <br />
              The experimentation process is outlined below.
            </p>

            <p className="mb-4 leading-relaxed">
              We follow the model's prompt format, adding context in a simple
              bullet point format. For long-form text generation, we use the
              question as the prompt and apply the necessary formatting required
              by the model.
            </p>
            <p className="mb-4 leading-relaxed">
              Generation: The generations are done using private APIs, Together,
              and hosting model on HuggingFace.
            </p>
          </SectionWrapper>

          <SectionWrapper num={5} title="Evaluation">
            <p className="mb-4 text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Scoring
            </p>
            <p className="mb-4 leading-relaxed">
              After preparing the prompts and generation for each model and
              dataset, they were evaluated using ChainPoll to obtain the task
              score. ChainPoll utilizes the strong reasoning abilities of GPTs
              and employs a technique of polling the model multiple times to
              assess the accuracy of the response. This approach not only
              quantifies the extent of potential errors but also provides an
              explanation based on the given context, particularly in the case
              of RAG-based systems.
            </p>

            <div className="my-12 flex flex-col lg:flex-row lg:justify-between lg:gap-12">
              <p className="mb-4 max-w-[354px] text-[18px] leading-loose sm:text-[20px] lg:mb-0 lg:w-[45%] lg:text-[22px]">
                Chainpoll: A High Efficacy Method for LLM Hallucination
                Detection
              </p>
              <p className="mb-4 max-w-[600px] leading-relaxed lg:mb-0 lg:w-[55%]">
                A high accuracy methodology for hallucination detection that
                provides an 85% correlation with human feedback - your first
                line of defense when evaluating model outputs.
              </p>
            </div>

            <div className="border border-gray-400/40 p-2 lg:p-12">
              <p className="mb-8 leading-relaxed">
                ChainPoll: a novel approach to hallucination detection that is
                substantially more accurate than any metric we’ve encountered in
                the academic literature. Across a diverse range of benchmark
                tasks, the ChainPoll outperforms all other methods – in most
                cases, by a huge margin.
                <br />
                ChainPoll dramatically out-performs a range of published
                alternatives – including SelfCheckGPT, GPTScore, G-Eval, and
                TRUE – in a head-to-head comparison on RealHall.
                <br />
                ChainPoll is also faster and more cost-effective than most of
                the metrics listed above.
                <br />
                Unlike all other methods considered here, ChainPoll also
                provides human-readable verbal justifications for the judgments
                <br />
                it makes, via the chain-of-thought text produced during
                inference.
                <br />
                Though much of the research literature concentrates on the the
                easier case of closed-domain hallucination detection, we show
                that ChainPoll is equally strong when detecting either
                open-domain or closed domain hallucinations. We develop versions
                of ChainPoll specialized to each of these cases:
                ChainPoll-Correctness for open-domain and ChainPoll-Adherence
                for closed-domain.
              </p>
              <ChainPollTable2024
                theaderOne={'Metric'}
                theaderTwo={'Aggregate AUROC'}
                items={metricItems}
              />
            </div>

            <div className="mt-12 border bg-[#F8F9FF] px-12">
              <div className="my-12 flex flex-col lg:flex-row lg:justify-between lg:gap-12">
                <p className="max-w-[354px] text-[18px] font-semibold md:text-[24px] lg:w-[45%] lg:text-[28px] xl:text-[32px]">
                  How does this work?
                </p>
                <p className="mb-4 max-w-[500px] leading-relaxed lg:mb-0 lg:w-[55%]">
                  Chainpoll piggybacks on the strong reasoning power of your
                  LLMs, but further leverages a chain of thought technique to
                  poll the model multiple times to judge the correctness of the
                  response. This technique not only provides a metric to
                  quantify the degree of potential hallucinations, but also
                  provides an explanation based on the context provided, in the
                  case of RAG based systems.
                </p>
              </div>

              <div className="max-w-full">
                <MethodologyHowDoesItWork className="w-full max-w-full" />
              </div>
            </div>

            <div className="relative z-10 mb-6 mt-20 h-fit w-full overflow-hidden rounded-xl bg-smoky-black p-8 md:px-8 md:py-20">
              <div className="absolute -bottom-[30%] left-[25%] -z-10 h-2/3 w-2/3 bg-gradient-hero blur-[9rem]"></div>
              <div className="flex w-full flex-col items-center gap-4">
                <h3 className="text-center text-3xl font-medium leading-tight text-white md:w-[50%] md:text-4xl">
                  <span>Learn More</span>
                </h3>
                <div className="flex w-full flex-col flex-nowrap items-center justify-center gap-4 pt-4 md:w-fit md:flex-row md:gap-10">
                  <a
                    href="https://arxiv.org/abs/2406.00975v2"
                    target="_blank"
                    rel="noopener noreferrer"
                    className="w-full min-w-52 flex-1 whitespace-nowrap rounded-lg bg-white px-6 py-3 text-center text-base font-semibold text-primary-600 hover:opacity-80 md:w-fit"
                  >
                    Read the Paper
                  </a>
                  <a
                    className="btn-gradient-border w-full min-w-52 flex-1 whitespace-nowrap !px-6 !py-3 text-center md:w-fit"
                    href="/blog/webinar-the-future-of-enterprise-genai-evaluations"
                  >
                    Watch the Webinar
                  </a>
                </div>
              </div>
            </div>

            <p className="mt-16 text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Evaluation
            </p>
            <p className="mb-4 leading-relaxed">
              We selected an LLM-based evaluation to keep the approach scalable.
              ChainPoll powers the metrics used to evaluate output propensity
              for hallucination.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Task score
            </p>
            <p className="mb-4 leading-relaxed">
              The final score shown is calculated as the mean of the score for
              each task dataset. The dataset score is the mean of the ChainPoll
              score for each sample.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Learn more
            </p>
            <p className="mb-4 leading-relaxed">
              We have developed a comprehensive set of RAG metrics to cover
              various evaluation aspects of these models. Our{' '}
              <a
                target="_blank"
                rel="noopener noreferrer"
                href="https://docs.rungalileo.io/galileo/gen-ai-studio-products/guardrail-store"
              >
                documentation
              </a>{' '}
              provides a detailed breakdown of each RAG metric and our
              methodologies.
            </p>

            <p className="mb-4 mt-16 text-[18px] font-semibold md:text-[24px] lg:text-[28px] xl:text-[32px]">
              About Context Adherence
            </p>
            <p className="mb-4 leading-relaxed">
              Context Adherence evaluates the degree to which a model's response
              aligns strictly with the given context, serving as a metric to
              gauge closed-domain hallucinations, wherein the model generates
              content that deviates from the provided context.
            </p>
            <p className="mb-4 leading-relaxed">
              The higher the Context Adherence score (i.e., it has a value of 1
              or close to 1), the more likely the response is to contain only
              information from the context provided to the model.
            </p>
            <p className="mb-4 leading-relaxed">
              The lower the Context Adherence score (ie., it has a value of 0 or
              close to 0), the response is more likely to contain information
              not included in the context provided to the model.
            </p>
            <p className="mb-4 leading-relaxed">
              These metrics are powered by ChainPoll, a hallucination detection
              methodology developed by Galileo Labs. You can read more about
              ChainPoll here:{' '}
              <a target="_blank" href="https://arxiv.org/abs/2310.18344">
                https://arxiv.org/abs/2310.18344
              </a>
            </p>
          </SectionWrapper>
          <SectionWrapper
            num={6}
            title="How to use the Hallucination Index for LLM selection?"
          >
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Task Alignment
            </p>
            <p className="mb-4 leading-relaxed">
              Begin by identifying which of our benchmarking task types aligns
              most closely with your specific application.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Top 3 Model Selection
            </p>
            <p className="mb-4 leading-relaxed">
              Based on your criteria, carefully select the three top-performing
              models for your identified task. Consider factors such as
              performance, cost, and privacy with your objectives.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Exploration of New Models
            </p>
            <p className="mb-4 leading-relaxed">
              Extend your model pool by adding any additional models you believe
              could deliver strong performance in your application context. This
              proactive approach allows for a more comprehensive evaluation.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Data Preparation
            </p>
            <p className="mb-4 leading-relaxed">
              Prepare a high-quality evaluation dataset using real-world data
              specific to your task. This dataset should be representative of
              the challenges and nuances to be faced in production.
            </p>

            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Performance Evaluation
            </p>
            <p className="mb-4 leading-relaxed">
              Execute a thorough evaluation of the selected models using your
              prepared dataset. Assess their performance based on relevant
              metrics, ensuring a comprehensive understanding of each model's
              strengths and weaknesses.
            </p>
          </SectionWrapper>
        </div>
      </section>
      <PreFooter />
    </section>
  );
};

/**
 * Methodology page component.
 *
 * Renders the `LlmHero` content wrapped in the shared `MarketingLayout`.
 * The page props it is typed to receive are not read.
 */
const MethodologyPage: React.FC<PageProps> = () => (
  <MarketingLayout>
    <LlmHero />
  </MarketingLayout>
);

export default MethodologyPage;

/**
 * Document-head metadata for this page.
 *
 * Renders the project's `SEO` component with a fixed title and description.
 * NOTE(review): exported under the name `Head`, presumably so Gatsby's Head
 * API picks it up for this page — confirm against the Gatsby version in use.
 */
export const Head = () => {
  const title = 'Methodology | LLM Hallucination Index';
  const description =
    'Why we built LLM Hallucination Index, use cases, and metrics';

  return <SEO title={title} description={description} />;
};
