import React, { useState, useRef, useEffect } from 'react';
import { PageProps } from 'gatsby';
import { useStaticQuery, graphql } from 'gatsby';
import { useInView } from 'react-intersection-observer';
import { GatsbyImage } from 'gatsby-plugin-image';

import Fade from '@/components/common/fade';

import MarketingLayout from '@/components/common/hl-marketing-layout';
import FormSection from '@/components/hallucination-index-2023/form-section';
import SEO from '@/components/seo';

import Magnifier from '@/assets/icon-magnifier.svg';
import Bubble from '@/assets/icon-bubble.svg';
import Inspect from '@/assets/icon-inspect.svg';
import Check from '@/assets/icon-check.svg';

import Logo from '@/assets/logo-current-color.svg';
// import Logo from '@/assets/logo-v2.svg';

import Prefooter from '@/components/hallucination-index-2023/prefooter';
import ShowMoreItem from '@/components/common/show-more';
import ExternalArrow from '@/assets/learn-more.svg';
import {
  ScrollWrap,
  TableScrollWrap,
  ChainPollTable,
  DataTable,
} from '@/components/common/scroll-wrap';

/**
 * Renders the text-generation configuration as a preformatted snippet.
 *
 * The snippet is display text only (it is never parsed), so it is kept
 * character-for-character, including the trailing spaces after some commas.
 */
const CodeBlock = () => {
  // One entry per rendered line; joined with newlines to form the snippet.
  const generationConfig = [
    '{',
    '  "max_new_tokens": 1024, ',
    '  "temperature":0.1, ',
    '  "top_p":0.9, ',
    '  "do_sample":True,',
    '  "seed": 0,',
    '  "best_of": 1,',
    '}',
  ].join('\n');

  return (
    <pre className="mt-4 overflow-auto border border-hi-3 bg-hi-2 p-4">
      <code>{generationConfig}</code>
    </pre>
  );
};

import {
  metricItems,
  modelsByCreators,
} from '@/utils/hallucination-index-data';

/**
 * Copy for the three "about the index" blurbs (Context, Design, Execution).
 * Each entry is a heading (`title`) and its body paragraph (`text`).
 */
const aboutIndex = [
  {
    title: 'Context',
    text: 'While there is no shortage of benchmark indices in the market, very few measure hallucinations. Measuring hallucinations is difficult, as LLM performance varies based on use-case, dataset, and more. Furthermore, there’s no consistent framework and set of metrics for measuring and evaluating hallucinations.',
  },
  {
    title: 'Design',
    text: 'Hallucination Index is built to measure the efficacy of LLMs across 3 common use-cases - Q&A without context, Q&A with context, and long-form text generation. It measures the hallucination of models in free form text.',
  },
  {
    title: 'Execution',
    text: 'The Index ranks 11 leading LLMs based on their propensity to hallucinate for each use case. We select 7 standard datasets to create tasks which match the requirement in the industry. LLMs are evaluated using 2 metrics, Correctness and Context Adherence which are built with state of the art technique ChainPoll.',
  },
];

/**
 * Rows passed as `items` to `DataTable` in the "Dataset Selection" section.
 *
 * `title` is the row label; `td1`..`td3` are the cells for the three task
 * columns (QA without RAG, QA with RAG, long-form text generation).
 * The "Datasets" cells are raw HTML strings — presumably injected as markup by
 * `DataTable` (TODO confirm) — so every link that opens a new tab carries
 * `rel="noopener noreferrer"` to keep the opened page from reaching
 * `window.opener`.
 */
const dataTableItems = [
  {
    title: 'Definition',
    td1: 'Generates short answer for a question without context',
    td2: 'Generates an answer for a question with context',
    td3: 'Generates long answer based on a prompt',
  },

  {
    title: 'Datasets',
    td1: `
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/truthful_qa">TruthfulQA</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/trivia_qa">TriviaQA</a>
    `,
    td2: `
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/narrativeqa">NarrativeQA</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/drop">DROP</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/ms_marco">MS Marco</a>,
      <a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/hotpot_qa/viewer/distractor/validation">HotpotQA distractor test</a>
    `,
    td3: `<a class="text-hi-100 font-medium hover:underline" target="_blank" rel="noopener noreferrer" href="https://huggingface.co/datasets/OpenAssistant/oasst1">Open Assistant</a>`,
  },
];

/** Props accepted by {@link SectionWrapper}. */
type SectionWrapperProps = {
  /** Section body, rendered below the sticky header row. */
  children?: React.ReactNode;
  /** Badge content shown in the bordered square (the section number). */
  num: React.ReactNode;
  /** Heading shown next to the badge. */
  title: React.ReactNode;
};

/**
 * Layout shell for one numbered section of the page.
 *
 * Renders a sticky header row (number badge + title) followed by the section
 * content. On `md` screens and up, an empty spacer with the same width as the
 * badge is placed in front of the content so it lines up under the title.
 */
const SectionWrapper = ({ children, num, title }: SectionWrapperProps) => {
  return (
    <div className="section-wrapper border-t border-hi-3 py-8 sm:py-10 md:py-12 lg:py-14 xl:py-16">
      {/* Sticky header: stays pinned while the section content scrolls. */}
      <div className="sticky top-14 z-10 -mx-4 flex flex-row items-center bg-white py-3 lg:top-20">
        <div className="w-auto shrink px-4">
          <div className="">
            <div className="flex h-10 w-10 items-center justify-center rounded-lg border border-hi-3 font-serif text-[20px] font-bold md:text-[24px] lg:text-[28px]">
              {num}
            </div>
          </div>
        </div>
        <div className="w-auto w-full grow md:px-4">
          <h3 className="font-serif text-[18px] font-normal md:text-[24px] lg:text-[28px] xl:text-[32px]">
            {title}
          </h3>
        </div>
      </div>
      <div className="-mx-4 flex flex-col bg-white md:flex-row md:flex-nowrap">
        {/* Invisible spacer matching the badge column (hidden below md). */}
        <div className="hidden w-auto shrink px-4 md:block">
          <div className="">
            <div className="h-10 w-10 font-serif lg:text-[28px]"></div>
          </div>
        </div>
        <div className="w-auto w-full grow px-4">
          <hr className="my-2 opacity-0" />
          {children}
        </div>
      </div>
    </div>
  );
};

/**
 * Dataset references shown in the "More Info" panel, grouped by task type.
 * Each dataset has a display `name`, an external `href`, and a one-line
 * `description` rendered after the link.
 */
const datasetReferenceGroups = [
  {
    heading: 'Q&A without RAG',
    datasets: [
      {
        name: 'TruthfulQA',
        href: 'https://arxiv.org/abs/2109.07958',
        description:
          'A benchmark to measure biases of large language models using question answering task.',
      },
      {
        name: 'TriviaQA',
        href: 'https://nlp.cs.washington.edu/triviaqa/',
        description:
          'A reading comprehension dataset containing question-answer-evidence triples.',
      },
    ],
  },
  {
    heading: 'Q&A with RAG',
    datasets: [
      {
        name: 'NarrativeQA',
        href: 'https://huggingface.co/datasets/narrativeqa',
        description:
          'A dataset of stories and corresponding questions designed to test reading comprehension, especially on long documents.',
      },
      {
        name: 'DROP',
        // NOTE(review): the data table links DROP to datasets/drop instead —
        // confirm which target is intended.
        href: 'https://huggingface.co/datasets/mrqa',
        description:
          'Reading comprehension benchmark which requires Discrete Reasoning Over the content of Paragraphs. Answering requires resolving references in a question, perhaps to multiple input positions, and performing discrete operations over them (such as addition, counting, or sorting).',
      },
      {
        name: 'Microsoft MS MARCO',
        href: 'https://microsoft.github.io/msmarco/',
        description:
          'A dataset containing queries and paragraphs with relevance labels.',
      },
      {
        name: 'HotpotQA',
        href: 'https://huggingface.co/datasets/hotpot_qa',
        description:
          'A dataset with Wikipedia-based question-answer pairs that require finding and reasoning over multiple supporting documents to answer.',
      },
    ],
  },
  {
    heading: 'Long-form text Generation',
    datasets: [
      {
        name: 'OpenAssistant',
        href: 'https://github.com/LAION-AI/Open-Assistant',
        description:
          'A human-generated, human-annotated assistant-style conversation corpus. It covers factual questions on varied domains.',
      },
    ],
  },
];

/**
 * Body of the "More Info" panel: one titled bullet list per task type, each
 * bullet linking to a dataset and describing it.
 *
 * Links open in a new tab, so they carry `rel="noopener noreferrer"` to keep
 * the opened page from accessing `window.opener`.
 */
const ShowMoreSection = () => {
  return (
    <div className="space-y-6 lg:space-y-10">
      {datasetReferenceGroups.map((group) => (
        <div key={group.heading}>
          <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
            {group.heading}
          </p>
          <hr className="mb-4 mt-2 border-t border-hi-3 xl:mb-6 xl:mt-4" />
          <ul className="list-disc space-y-4 pl-8">
            {group.datasets.map((dataset) => (
              <li key={dataset.name}>
                <a
                  className="text-hi-100 hover:underline"
                  target="_blank"
                  rel="noopener noreferrer"
                  href={dataset.href}
                >
                  {dataset.name}
                </a>
                : {dataset.description}
              </li>
            ))}
          </ul>
        </div>
      ))}
    </div>
  );
};

/**
 * Full content of the Methodology page: the hero header, the six numbered
 * methodology sections, the contact form, and the prefooter.
 *
 * The only interactive part is the horizontally scrolling list of model
 * creators in section 1, which can be dragged with the mouse (see the effect
 * below).
 */
const LlmHero = () => {
  // Scroll container of the model-creator cards; target of the drag handlers.
  const refSlider = useRef<HTMLDivElement>(null);

  // Click-and-drag horizontal scrolling for the creator cards.
  // Listeners are attached once on mount and removed on unmount so they do not
  // outlive the component.
  useEffect(() => {
    const slider = refSlider.current;
    if (!slider) return undefined;

    let isDown = false;
    let startX = 0;
    let scrollLeft = 0;

    const handleMouseDown = (e: MouseEvent) => {
      isDown = true;
      slider.classList.add('active');
      // Remember where the drag started, relative to the slider.
      startX = e.pageX - slider.offsetLeft;
      scrollLeft = slider.scrollLeft;
    };

    // Shared by mouseleave and mouseup: both end the drag.
    const stopDragging = () => {
      isDown = false;
      slider.classList.remove('active');
    };

    const handleMouseMove = (e: MouseEvent) => {
      if (!isDown) return;
      e.preventDefault();
      const x = e.pageX - slider.offsetLeft;
      const walk = (x - startX) * 1; // multiplier controls scroll speed
      slider.scrollLeft = scrollLeft - walk;
    };

    slider.addEventListener('mousedown', handleMouseDown);
    slider.addEventListener('mouseleave', stopDragging);
    slider.addEventListener('mouseup', stopDragging);
    slider.addEventListener('mousemove', handleMouseMove);

    return () => {
      slider.removeEventListener('mousedown', handleMouseDown);
      slider.removeEventListener('mouseleave', stopDragging);
      slider.removeEventListener('mouseup', stopDragging);
      slider.removeEventListener('mousemove', handleMouseMove);
    };
  }, []);

  return (
    <section className="relative pt-14 lg:pt-20">
      {/* header */}

      <div className="flex flex-col justify-end">
        <div className="mx-auto h-full w-full max-w-[1280px] pb-14 pt-20">
          <div className="flex h-full flex-wrap items-end justify-between">
            <div className="w-auto px-4">
              <h1 className="text-[32px] font-normal leading-none sm:text-[40px] md:text-[48px] lg:text-[56px] xl:text-[64px]">
                Methodology
              </h1>
              <p className="mt-8 text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
                The Hallucination Index is an ongoing initiative to evaluate and
                rank the largest and most popular LLMs propensity to hallucinate
                across common task types. The models were evaluated using a
                diverse set of datasets, chosen for their popularity and ability
                to challenge the models' abilities to stay on task. Below is the
                methodology used to create the Hallucination Index.
              </p>
            </div>
          </div>
        </div>
      </div>

      <section className="">
        <div className="mx-auto h-full max-w-[1280px] px-4">
          <SectionWrapper num={1} title="Model Selection">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              The Hallucination Index evaluated the largest and most popular
              LLMs available today. LLMs were chosen by surveying popular LLM
              repos, leaderboards, and industry surveys. The LLMs selected
              represent a combination of open-source and closed-source models of
              varying sizes. This domain is evolving, with new models being
              released on a weekly basis.
            </p>

            <p className="mt-4">
              Hallucination Index will be updated quarterly. To see an LLM added
              to the Hallucination Index{' '}
              <a href="#hallucination-form" className="text-hi-100">
                reach out here.
              </a>
            </p>

            <hr className="my-4 opacity-0" />

            <div className="mt-4">
              <div className="relative -mr-4 overflow-hidden">
                {/* drag left and right via refSlider */}
                <div
                  className="hide-scroll-bar relative cursor-grab overflow-auto active:cursor-grabbing"
                  ref={refSlider}
                >
                  <div className="flex flex-nowrap gap-4 pb-4">
                    {modelsByCreators.map((i, idx) => {
                      return (
                        <div
                          key={idx}
                          className="flex-none basis-[248px] last:pr-4 md:basis-[256px] lg:basis-[268px]"
                        >
                          <div className="h-full border border-hi-3 bg-hi-2">
                            <div className="flex flex-row items-center space-x-2 border-b border-hi-3 bg-hi-2 p-4 lg:p-5">
                              <div>
                                {/* Decorative: the creator name is rendered as text right next to it. */}
                                <img
                                  width={24}
                                  height={24}
                                  alt=""
                                  src={`/creators/${i.creator}`}
                                />
                              </div>
                              <p className="">{i['creator-name']}</p>
                            </div>
                            <div className="p-4 lg:p-5">
                              <ul className="list-disc space-y-2.5 pl-4">
                                {i.models.map((m, modelIdx) => {
                                  return (
                                    <li key={modelIdx} className="opacity-80">
                                      {/* toLowerCase (not toLocaleLowerCase) keeps the slug
                                          independent of the host locale. Only the first '.'
                                          is stripped, as before. */}
                                      <a
                                        className=""
                                        href={`/hallucinationindex-2023/${m.Model.toLowerCase().replace(
                                          '.',
                                          '',
                                        )}`}
                                      >
                                        {m.Model}
                                      </a>
                                    </li>
                                  );
                                })}
                              </ul>
                            </div>
                          </div>
                        </div>
                      );
                    })}
                  </div>
                </div>
              </div>
            </div>
          </SectionWrapper>

          <SectionWrapper num={2} title="Task Type Selection">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Next, LLMs were tested across three common task types to observe
              their performance. In selecting task types, we selected tasks
              relevant to developers and end-users and tested each LLM’s ability
              to operate with and without context.
            </p>
            <hr className="my-4 opacity-0" />
            <p className="text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Task Types Selected:
            </p>
            <hr className="my-4 border-t border-hi-3 xl:my-6" />

            <ul className="list-disc space-y-3 pl-4">
              <li>
                <p className="mb-1 font-semibold">
                  Question & Answer without RAG
                </p>
                <p className="leading-relaxed">
                  A model that, when presented with a question, relies on the
                  internal knowledge and understanding that the AI model has
                  already acquired during its training. It generates answers
                  based on patterns, facts, and relationships it has learned
                  without referencing external sources of information.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">
                  Question & Answer with RAG{' '}
                </p>
                <p className="leading-relaxed">
                  A model that, when presented with a question, uses retrieved
                  information from a given dataset, database, or set of
                  documents to provide an accurate answer. This approach is akin
                  to looking up information in a reference book or searching a
                  database before responding.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">Long-Form Text Generation</p>
                <p className="leading-relaxed">
                  Using generative AI to create extensive and coherent pieces of
                  text such as reports, articles, essays, or stories. For this
                  use-case, AI models are trained on large datasets to
                  understand context, maintain subject relevance, and mimic a
                  natural writing style over longer passages.
                </p>
              </li>
            </ul>
          </SectionWrapper>

          <SectionWrapper num={3} title="Dataset Selection">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              The Hallucination Index assesses LLM performance by leveraging 7
              popular datasets. The datasets effectively challenge each LLM's
              capabilities relevant to the task at hand. For the Q&A with RAG
              task we convert the query and document to form the input prompt
              with context. While for Q&A without RAG & Long form text
              generation we use the question as the prompt with formatting
              required for the respective model.
            </p>
            <hr className="my-4 opacity-0" />

            {/* <hr className="my-4 xl:my-6 border-t border-hi-3" /> */}

            <TableScrollWrap>
              <DataTable
                theaderOne={'Tasks Type'}
                theaderTwo={'QA without RAG'}
                theaderThree={'QA with RAG'}
                theaderFour={'Long Form Text Generation'}
                items={dataTableItems}
              />
            </TableScrollWrap>

            <hr className="my-4 opacity-0" />

            <ShowMoreItem isOpen={true} title="More Info">
              <ShowMoreSection />
            </ShowMoreItem>
          </SectionWrapper>

          <SectionWrapper num={4} title="Experimentation">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              Once LLMs, Task Types, and Datasets were selected, experimentation
              began. The experimentation process is outlined below.
            </p>
            <hr className="my-4 border-t border-hi-3 xl:my-6" />

            <ul className="list-decimal space-y-3 pl-4">
              <li>
                <b>Prompt formatting:</b> The prompts were formatted differently
                as per the need of the task.
              </li>
              <li>
                <b>Prompt format for Q&A without RAG:</b> We construct zero-shot
                prompts without any chain of thought (CoT) instructions. We use
                the prompt format as per the model.
              </li>
              <li>
                <b>Prompt format for Q&A with RAG:</b> We construct zero-shot
                prompts without any chain of thought (CoT) instructions. We use
                the prompt format as per the model. We add the context in the
                prompt with a simple bullet point format.
              </li>
              <li>
                <b>Prompt format for long-form text generation:</b> We use the
                question as the prompt along with prompt formatting required for
                the model.
              </li>
              <li>
                <b>Generation:</b> The generations are done using the same text
                generation configuration using TGI server with bitsandbytes NF4
                quantisation.
              </li>
            </ul>
          </SectionWrapper>

          <SectionWrapper num={5} title="Evaluation">
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              <b>Scoring</b>
              <br />
              After the prompts and generation were ready for each model and
              dataset, they were scored with ChainPoll to get the task score.
            </p>

            <hr className="my-4 border-t border-hi-3 xl:my-6" />

            <p className="mb-3">
              <b>Evaluation</b>
              <br />
              We selected an LLM based evaluation to keep the approach scalable.
              The metrics used to evaluate output propensity for hallucination
              are powered by ChainPoll.
            </p>

            <p className="mb-3">
              Existing benchmarks stick to traditional statistical metrics, but
              detecting hallucinations reliably has more to do with detecting
              qualitative nuances in the model’s output that are specific to the
              task types.
            </p>
            <p className="mb-3">
              While{' '}
              <a
                className="text-primary hover:underline"
                target="_blank"
                rel="noopener noreferrer"
                href="https://www.microsoft.com/en-us/research/publication/gpteval-nlg-evaluation-using-gpt-4-with-better-human-alignment/"
              >
                asking GPT-4
              </a>{' '}
              to detect hallucinations is a popular (albeit expensive) method to
              rely on,{' '}
              <a
                className="text-primary hover:underline"
                target="_blank"
                rel="noopener noreferrer"
                href="https://arxiv.org/abs/2310.18344"
              >
                ChainPoll
              </a>{' '}
              has emerged as a superior method to detect hallucinations from
              your model’s outputs, with a high correlation against human
              benchmarking.
            </p>

            <p className="mb-3">
              <b>Annotation</b>
              <br />
              We leveraged human annotation to confirm the reliability of metric
              for each task type in our ChainPoll experiments as well as Index
              experiments. The ChainPoll paper makes use of RealHall dataset
              which consists of open and closed domain prompts.
            </p>

            <p className="">
              <b>Task score</b>
              <br />
              The final score shown in the bar chart is calculated as the mean
              of score for each dataset of the task. The dataset score is the
              mean of ChainPoll score for each sample.
            </p>

            <hr className="my-4 opacity-0" />

            <ChainPollTable
              theaderOne={'Metric'}
              theaderTwo={'Aggregate AUROC'}
              items={metricItems}
            />

            <p className="my-4 text-sm">
              Hallucination detection performance on RealHall, averaged across
              datasets
            </p>
            <p className="mb-1 text-sm lg:text-base">
              <a
                target="_blank"
                rel="noopener noreferrer"
                href="https://arxiv.org/abs/2310.18344"
                className="group flex items-center font-medium text-hi-100"
              >
                Learn more about ChainPoll
              </a>
            </p>

            <hr className="my-4 opacity-0" />

            <p className="text-[18px] font-medium leading-loose sm:text-[20px] lg:text-[22px]">
              Evaluation Metrics
            </p>
            <hr className="my-4 border-t border-hi-3 xl:my-6" />

            <ul className="list-disc space-y-6 pl-4">
              <li>
                <p className="mb-4 font-semibold">
                  <a
                    href="https://docs.rungalileo.io/galileo/how-to-and-faq/ml-research-algorithms/guardrail-metrics/factuality"
                    className="text-hi-100 hover:underline"
                  >
                    Correctness
                  </a>
                  :
                </p>
                <p>
                  Measures whether a given model response is factual or not.
                  Correctness uncovers open-domain hallucinations - factual
                  errors that do not relate to any specific documents or
                  context.
                </p>
                <ul className="mt-4 list-disc space-y-4 pl-4 opacity-80">
                  <li>
                    The higher the Correctness score (i.e., it has a value of 1
                    or close to 1), the higher the probability the response is
                    accurate.
                  </li>
                  <li>
                    The lower the Correctness score (i.e., it has a value of 0
                    or close to 0), the higher the probability of hallucination
                    and factual errors
                  </li>
                </ul>
              </li>

              <li>
                <p className="mb-4 font-semibold">
                  <a
                    className="text-hi-100 hover:underline"
                    href="https://docs.rungalileo.io/galileo/how-to-and-faq/ml-research-algorithms/guardrail-metrics/groundedness"
                  >
                    Context Adherence
                  </a>
                  :
                </p>
                <p>
                  Context Adherence evaluates the degree to which a model's
                  response aligns strictly with the given context, serving as a
                  metric to gauge closed-domain hallucinations, wherein the
                  model generates content that deviates from the provided
                  context.
                </p>
                <ul className="mt-4 list-disc space-y-4 pl-4 opacity-80">
                  <li>
                    The higher the Context Adherence score (i.e., it has a value
                    of 1 or close to 1), the response is more likely to only
                    contain information from the context provided to the model.{' '}
                  </li>
                  <li>
                    The lower the Context Adherence score (i.e., it has a value
                    of 0 or close to 0), the response is more likely to contain
                    information not included in the context provided to the
                    model.{' '}
                  </li>
                </ul>
              </li>
            </ul>

            <hr className="my-4 opacity-0" />
            {/* <p className="leading-loose text-[18px] sm:text-[20px] lg:text-[22px] font-medium">
              Evaluation Methodology
            </p> */}
            <hr className="my-4 border-t border-hi-3 xl:my-6" />
            <ul>
              <li>
                These metrics are powered by ChainPoll, a hallucination
                detection methodology developed by Galileo Labs. You can read
                more about ChainPoll here:{' '}
                <a
                  className="text-hi-100 hover:underline"
                  target="_blank"
                  rel="noopener noreferrer"
                  href="https://arxiv.org/abs/2310.18344"
                >
                  https://arxiv.org/abs/2310.18344
                </a>
              </li>
            </ul>
          </SectionWrapper>
          <SectionWrapper
            num={6}
            title="How to use the Hallucination Index for LLM selection?"
          >
            <p className="text-[18px] leading-loose sm:text-[20px] lg:text-[22px]">
              While our model ranking provides valuable insights for various
              tasks, we acknowledge that it does not cover all applications and
              domains comprehensively. To address this, we have plans to
              incorporate additional models and datasets in the future. To
              request a specific model,{' '}
              <a href="#hallucination-form">get in touch</a> below. In the
              meantime, here's a suggested approach to refine your model
              selection process:
            </p>
            <hr className="my-4 opacity-0" />

            <hr className="my-4 border-t border-hi-3 xl:my-6" />

            <ul className="list-disc space-y-3 pl-4">
              <li>
                <p className="mb-1 font-semibold">Task Alignment </p>
                <p className="leading-relaxed">
                  Begin by identifying which of our benchmarking task types
                  aligns most closely with your specific application.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">Top 3 Model Selection</p>
                <p className="leading-relaxed">
                  Based on your criteria, carefully select the three
                  top-performing models for your identified task. Consider
                  factors such as performance, cost, and privacy with your
                  objectives.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">Exploration of New Models</p>
                <p className="leading-relaxed">
                  Extend your model pool by adding any additional models you
                  believe could deliver strong performance in your application
                  context. This proactive approach allows for a more
                  comprehensive evaluation.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">Data Preparation</p>
                <p className="leading-relaxed">
                  Prepare a high-quality evaluation dataset using real-world
                  data specific to your task. This dataset should be
                  representative of the challenges and nuances to be faced in
                  production.
                </p>
              </li>
              <li>
                <p className="mb-1 font-semibold">Performance Evaluation</p>
                <p className="leading-relaxed">
                  Execute a thorough evaluation of the selected models using
                  your prepared dataset. Assess their performance based on
                  relevant metrics, ensuring a comprehensive understanding of
                  each model's strengths and weaknesses.
                </p>
              </li>
            </ul>

            <hr className="my-4 opacity-0" />

            <p>
              By following these steps, you'll gain a nuanced perspective on
              model suitability for your application, enabling you to make
              informed decisions in selecting the most appropriate model. Stay
              tuned for updates as we expand our model offerings to further
              cater to diverse applications and domains.
            </p>
          </SectionWrapper>
        </div>
      </section>
      <FormSection />
      <Prefooter />
    </section>
  );
};

/**
 * Gatsby page component for the Methodology route: wraps the page content
 * (`LlmHero`) in the shared marketing layout.
 */
const MethodologyPage: React.FC<PageProps> = () => (
  <MarketingLayout>
    <LlmHero />
  </MarketingLayout>
);

export default MethodologyPage;

/**
 * Gatsby Head export: supplies the page title and meta description through
 * the shared `SEO` component.
 */
export const Head = () => {
  return (
    <SEO
      title="Methodology | LLM Hallucination Index"
      description="Why we built LLM Hallucination Index, use cases, and metrics"
    />
  );
};
