RAG-Evaluation-Dataset-JAに関する備忘録

RAG-Evaluation-Dataset-JA

Last updated at 2025-04-11 / Posted at 2025-04-10

ダウンロード

git clone https://huggingface.co/datasets/allganize/RAG-Evaluation-Dataset-JA

# %%
#!pip install pandas

# %%
import pandas as pd

# Load the document list that ships with the cloned dataset repository.
documents_csv = "RAG-Evaluation-Dataset-JA/documents.csv"
df = pd.read_csv(documents_csv)
df  # last expression of the cell: shows the table in the notebook

# %%
import os
import requests

# Download every document listed in `df` (columns `url`, `file_name`) into the
# local "data" directory. Files that already exist are skipped, so the cell
# can be re-run to resume an interrupted download.
# NOTE(review): the context-extraction cell further down reads PDFs from
# "RAG-Evaluation-Dataset-JA/data/", not "data/" -- confirm which working
# directory this cell is meant to run in.
os.makedirs("data", exist_ok=True)  # open() does not create missing directories

for index, row in df.iterrows():
    url = row['url']
    file_name = row['file_name']
    print(url)
    file = f"data/{file_name}"
    if os.path.exists(file):
        continue

    try:
        buffer = requests.get(url=url, timeout=10)
        # Without this check an HTTP error page (404/500 ...) would be saved
        # as if it were the document and then skipped on every later run.
        buffer.raise_for_status()
        with open(file=file, mode="wb") as f:
            f.write(buffer.content)
    except (requests.RequestException, OSError) as e:
        # Best effort: report the failure and carry on with the next document.
        print(f"failed to download {url}: {e}")

# %%

Contextの追加

# %%
import pandas as pd

# Load the evaluation table; the following cell reads its
# `target_file_name` and `target_page_no` columns.
evaluation_csv = "RAG-Evaluation-Dataset-JA/rag_evaluation_result.csv"
df = pd.read_csv(evaluation_csv)
df  # last expression of the cell: shows the table in the notebook

# %%
import os
from langchain_community.document_loaders import PyPDFLoader

# Build one context string per row of `df`: the text of page `target_page_no`
# of the PDF named by `target_file_name`. Rows whose PDF is missing, cannot be
# parsed, or whose page number is invalid get an empty string, so `contexts`
# always has exactly one entry per row.
contexts = []
pages_by_file = {}  # file_path -> list of Documents, parsed once per PDF
for index, row in df.iterrows():
    target_file_name = row['target_file_name']
    target_page_no = row['target_page_no']
    file_path = f"RAG-Evaluation-Dataset-JA/data/{target_file_name}"
    print(file_path)
    if not os.path.exists(file_path):
        contexts.append("")
        continue

    try:
        if file_path not in pages_by_file:
            # load() yields one Document per PDF page. load_and_split() would
            # additionally re-chunk the text with a text splitter, after which
            # list positions no longer correspond to page numbers.
            pages_by_file[file_path] = PyPDFLoader(file_path=file_path).load()
        documents = pages_by_file[file_path]

        # The original code indexes with `target_page_no - 1`, i.e. the page
        # number is treated as 1-based. int() also rejects NaN page numbers.
        page_index = int(target_page_no) - 1
        if not 0 <= page_index < len(documents):
            # Guard explicitly: a page number of 0 (or negative) would
            # otherwise wrap around and silently pick a page from the end.
            raise IndexError(
                f"page {target_page_no} out of range (1-{len(documents)})"
            )
        context = documents[page_index].page_content
    except Exception as e:
        # Best effort: keep the row aligned with an empty context, but say why.
        print(f"failed to extract context from {file_path}: {e}")
        context = ""
    contexts.append(context)
len(contexts)

# %%
# Sanity check: number of collected contexts (the loop above appends one
# entry per row of df).
len(contexts)

# %%
# Store the contexts as a new `context` column; assigning a list requires its
# length to equal the number of rows in df.
df['context'] = contexts
df

# %%
import csv

# Write the table (now including `context`) to CSV, without the index column
# and with every field quoted.
output_csv = "RAG-Evaluation-Dataset-JA.csv"
df.to_csv(output_csv, index=False, quoting=csv.QUOTE_ALL)

Generate_textの追加

# %%
import os
from dotenv import load_dotenv

# Read settings (the AZURE_* variables used below) from a .env file into
# os.environ; override=True lets .env values replace variables already set.
load_dotenv(override=True)

# %%
from azure.identity import ClientSecretCredential, get_bearer_token_provider

# Service-principal credential built from the tenant id, client id and client
# secret found in the environment.
tenant_id = os.getenv('AZURE_TENANT_ID')
client_id = os.getenv('AZURE_CLIENT_ID')
client_secret = os.getenv('AZURE_CLIENT_SECRET')
credential = ClientSecretCredential(tenant_id=tenant_id, client_id=client_id, client_secret=client_secret)

# %%
# Token provider for the Cognitive Services scope; it is handed to the chat
# model client as `azure_ad_token_provider`.
scopes = "https://cognitiveservices.azure.com/.default"
azure_ad_token_provider = get_bearer_token_provider(credential, scopes)

# %%
from langchain_openai import AzureChatOpenAI

# Chat model client. Endpoint, deployment name and API version come from the
# environment; authentication goes through the Azure AD token provider.
azure_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
azure_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")
api_version = os.getenv("API_VERSION")
temperature = 0
max_tokens = 4096

llm_settings = {
    "azure_endpoint": azure_endpoint,
    "api_version": api_version,
    "azure_deployment": azure_deployment,
    "azure_ad_token_provider": azure_ad_token_provider,
    "temperature": temperature,
    "max_tokens": max_tokens,
}
llm = AzureChatOpenAI(**llm_settings)

# %%
from langchain_core.prompts import PromptTemplate

# Prompt sent to the model (in Japanese). Sections: instruction ("answer the
# question based on the given context"), context, question, answer.
# Placeholders filled in per row: {context} and {question}.
template = """# 指示:
与えられた文脈をもとに質問に回答してください。

# 文脈:
{context}

# 質問:
{question}

# 回答:
"""
prompt_template = PromptTemplate.from_template(template=template)

# %%
import pandas as pd

# Reload the CSV written by the context-extraction step.
filepath = "RAG-Evaluation-Dataset-JA.csv"
df = pd.read_csv(filepath)
df

# %%
# Drop rows that have no context (NaN).
# NOTE(review): presumably the empty-string contexts saved earlier come back
# as NaN when the CSV is re-read -- verify.
# inplace=True is kept deliberately so `df` stays the same DataFrame object.
df.dropna(subset=['context'], inplace=True)
df

# %%
import csv

# Ask the model one question per row, using the row's `context` and
# `question` to fill the prompt template. A failed call is recorded as an
# empty string so `generated_text` stays aligned with the rows of df.
generated_text = []
for index, row in df.iterrows():
    print(index)
    context = row.context
    question = row.question
    try:
        response = llm.invoke(input=prompt_template.format(context=context, question=question))
        answer = response.content
    except Exception as e:
        # Best effort: continue with the next row, but report which row
        # failed and why instead of hiding the error.
        print(f"generation failed for row {index}: {e}")
        answer = ""
    generated_text.append(answer)
len(generated_text)

# %%
# Attach the model answers as a new column and write the result to CSV,
# without the index column and with every field quoted.
df['generated_text'] = generated_text
generated_csv = "RAG-Evaluation-Dataset-JA-generated_text.csv"
df.to_csv(generated_csv, index=False, quoting=csv.QUOTE_ALL)

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up