{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import re\n", "import PyPDF2\n", "import json\n", "\n", "# import nltk\n", "# nltk.download('punkt')\n", "\n", "from dotenv import load_dotenv\n", "from openai import OpenAI\n", "from tqdm import tqdm\n", "\n", "# Load environment variables from the .envrc file\n", "load_dotenv('../.envrc')\n", "\n", "client = OpenAI(\n", " api_key=os.getenv(\"OPENAI_API_KEY\"),\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from elasticsearch import Elasticsearch\n", "\n", "# Create an Elasticsearch client instance\n", "es = Elasticsearch(\n", " [{'scheme': 'http', 'host': 'localhost', 'port': 9200}]\n", ")\n", "\n", "index_name = 'enhanced_stock_analyzer'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "index_mapping = {\n", " 'mappings': {\n", " 'properties': {\n", " 'source': {'type': 'keyword'},\n", " 'chunk_id': {'type': 'keyword'},\n", " 'content': {'type': 'text'},\n", " 'summary': {'type': 'text'},\n", " 'key_topics': {'type': 'text'},\n", " 'embedding': {\n", " 'type': 'dense_vector',\n", " 'dims': 384 # Adjust based on your embedding dimensions\n", " }\n", " }\n", " }\n", "}\n", "\n", "# Create or update the index\n", "if not es.indices.exists(index=index_name):\n", " es.indices.create(index=index_name, body=index_mapping)\n", "else:\n", " es.indices.put_mapping(index=index_name, body=index_mapping)\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "668" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# letters_path = '../data/buffet_letters.json'\n", "# with open(letters_path, 'r') as json_file:\n", "# processed_letters = json.load(json_file)\n", "\n", "# report_path = '../data/annual_reports.json'\n", "# with open(report_path, 'r') as json_file:\n", "# processed_reports = json.load(json_file)\n", "\n", "\n", "# for item in processed_letters:\n", "# metdt = item['metadata'].split('\\n')\n", "# item['summary'] = metdt[0][len(\"summary:\"):].strip()\n", "# item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n", "# del item['metadata']\n", " \n", "# for item in processed_reports:\n", " \n", "# metdt = item['metadata'].split('\\n')\n", "\n", "# item['summary'] = metdt[0][len(\"summary:\"):].strip()\n", "# item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n", "# del item['metadata']\n", "\n", "# item['source'] = item['ticker'] + \" annual \" + item['source']\n", "# del item['ticker']\n", "\n", "\n", "papers_path = '../data/research_papers.json'\n", "with open(papers_path, 'r') as json_file:\n", " processed_papers = json.load(json_file)\n", "\n", "for item in processed_papers:\n", " \n", " metdt = item['metadata'].split('\\n')\n", "\n", " item['summary'] = metdt[0][len(\"summary:\"):].strip()\n", " item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n", " del item['metadata']\n", "\n", "all_data = processed_papers #processed_letters #+ processed_reports\n", "len(all_data)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# from transformers import AutoTokenizer, AutoModel\n", "# import torch\n", "\n", "# def get_embeddings(text, model_name='yiyanghkust/finbert-pretrain'):\n", "# \"\"\"\n", "# Generates embeddings for the given text using a pre-trained model.\n", "\n", "# Parameters:\n", "# - text (str): The input text for which embeddings are to be generated.\n", "# - model_name (str): The name of the pre-trained model to use (default is 'yiyanghkust/finbert-pretrain').\n", "\n", "# Returns:\n", "# - numpy.ndarray: The [CLS] token embedding as a numpy array.\n", "# \"\"\"\n", "# # Load the tokenizer and model\n", "# tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "# model = AutoModel.from_pretrained(model_name)\n", "\n", "# # Tokenize the input text\n", "# encoded_input = tokenizer(\n", "# text,\n", "# add_special_tokens=True,\n", "# max_length=384,\n", "# padding='max_length',\n", "# truncation=True,\n", "# return_tensors='pt'\n", "# )\n", "\n", "# input_ids = encoded_input['input_ids']\n", "# attention_mask = encoded_input['attention_mask']\n", "\n", "# # Ensure the model is in evaluation mode\n", "# model.eval()\n", "\n", "# # Move inputs to GPU if available\n", "# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "# model.to(device)\n", "# input_ids = input_ids.to(device)\n", "# attention_mask = attention_mask.to(device)\n", "\n", "# # Generate embeddings\n", "# with torch.no_grad():\n", "# outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n", "\n", "# # Extract the [CLS] token embedding\n", "# cls_embedding = outputs.last_hidden_state[:, 0, :] # Shape: (1, hidden_size)\n", "\n", "# # Convert to a NumPy array\n", "# cls_embedding_np = cls_embedding.cpu().numpy() # Move to CPU if necessary\n", "\n", "# return cls_embedding_np\n", "\n", "# # Example usage\n", "# # text = \"The company's revenue increased by 20% in the last quarter.\"\n", "# # embeddings = get_embeddings(text)\n", "# # print(embeddings)\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/whysocurious/.local/share/virtualenvs/buffett-wisdom-rag-yofaZaKX/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n", "/Users/whysocurious/.local/share/virtualenvs/buffett-wisdom-rag-yofaZaKX/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", " warnings.warn(\n" ] } ], "source": [ "from elasticsearch.helpers import bulk\n", "\n", "from sentence_transformers import SentenceTransformer\n", "\n", "model = SentenceTransformer('all-MiniLM-L6-v2') #('FinLang/finance-embeddings-investopedia') #('all-MiniLM-L6-v2') #(\"philschmid/bge-base-financial-matryoshka\")\n", "\n", "def generate_actions(data):\n", " for item in data:\n", " embedding = model.encode(item['content']).tolist() #get_embeddings(item['content']).tolist() #\n", " yield {\n", " '_index': index_name,\n", " '_id': item['chunk_id'],\n", " '_source': {\n", " 'source': item['source'],\n", " 'chunk_id': item['chunk_id'],\n", " 'content': item['content'],\n", " 'summary': item.get('summary', ''),\n", " 'key_topics': item.get('key_topics', ''),\n", " 'embedding': embedding\n", " }\n", " }\n", "\n", "# Bulk index the data\n", "bulk(es, generate_actions(all_data))\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "indx = bulk(es, generate_actions(all_data))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(668, [])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "indx" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mappings': {'properties': {'source': {'type': 'keyword'},\n", " 'chunk_id': {'type': 'keyword'},\n", " 'content': {'type': 'text'},\n", " 'summary': {'type': 'text'},\n", " 'key_topics': {'type': 'text'},\n", " 'embedding': {'type': 'dense_vector', 'dims': 384}}}}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "index_mapping " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "\n", "def hybrid_search(query, index=index_name, keyword_top_k=10, final_top_k=10):\n", " \"\"\"\n", " Performs a hybrid search using keyword (multi_match) and semantic (k-NN) search, then re-ranks the results.\n", "\n", " Parameters:\n", " - query (str): The user's question.\n", " - index (str): The Elasticsearch index name.\n", " - keyword_top_k (int): Number of top documents to retrieve from keyword search.\n", " - final_top_k (int): Number of top documents to return after re-ranking.\n", "\n", " Returns:\n", " - List of dictionaries containing the re-ranked search results.\n", " \"\"\"\n", "\n", " # Step 1: Perform Keyword Search (multi_match)\n", " keyword_query = {\n", " 'size': keyword_top_k,\n", " 'query': {\n", " 'multi_match': {\n", " 'query': query,\n", " 'fields': ['content','summary^2','key_topics^2'],\n", " 'fuzziness': 'AUTO'\n", " }\n", " },\n", " '_source': ['source', 'chunk_id', 'content', 'embedding', 'summary', 'key_topics']\n", " }\n", " keyword_response = es.search(index=index, body=keyword_query)\n", " keyword_hits = keyword_response['hits']['hits']\n", "\n", " # Step 2: Perform Semantic Search (k-NN)\n", " # Generate embedding for the query\n", " query_embedding = model.encode(query).tolist()\n", "\n", " # Build the semantic search query\n", " # Semantic search using script_score\n", " semantic_query = {\n", " 'size': keyword_top_k,\n", " 'query': {\n", " 'script_score': {\n", " 'query': {'match_all': {}},\n", " 'script': {\n", " 'source': \"cosineSimilarity(params.query_vector, 'embedding') + 1.0\",\n", " 'params': {'query_vector': query_embedding}\n", " }\n", " }\n", " },\n", " '_source': ['source', 'chunk_id', 'content', 'embedding', 'summary', 'key_topics']\n", " }\n", "\n", " semantic_response = es.search(index=index, body=semantic_query)\n", " semantic_hits = semantic_response['hits']['hits']\n", "\n", " # Step 3: Combine Results\n", " combined_hits = {}\n", " # Process keyword search results\n", " for hit in keyword_hits:\n", " chunk_id = hit['_id']\n", " combined_hits[chunk_id] = {\n", " 'source': hit['_source']['source'],\n", " 'year': hit['_source'].get('year', ''),\n", " 'chunk_id': chunk_id,\n", " 'content': hit['_source']['content'],\n", " 'summary': hit['_source']['summary'],\n", " 'key_topics': hit['_source']['key_topics'],\n", " 'keyword_score': hit['_score'],\n", " 'semantic_score': 0 # Will be updated if exists in semantic_hits\n", " }\n", " # Process semantic search results\n", " for hit in semantic_hits:\n", " chunk_id = hit['_id']\n", " if chunk_id in combined_hits:\n", " combined_hits[chunk_id]['semantic_score'] = hit['_score']\n", " else:\n", " combined_hits[chunk_id] = {\n", " 'source': hit['_source']['source'],\n", " # 'year': hit['_source'].get('year', ''),\n", " 'chunk_id': chunk_id,\n", " 'content': hit['_source']['content'],\n", " 'summary': hit['_source']['summary'],\n", " 'key_topics': hit['_source']['key_topics'],\n", " 'keyword_score': 0,\n", " 'semantic_score': hit['_score']\n", " }\n", "\n", " # Step 4: Re-rank the Combined Results\n", " # Normalize scores\n", " max_keyword_score = max(hit['keyword_score'] for hit in combined_hits.values()) or 1\n", " max_semantic_score = max(hit['semantic_score'] for hit in combined_hits.values()) or 1\n", "\n", " for hit in combined_hits.values():\n", " hit['keyword_score_normalized'] = hit['keyword_score'] / max_keyword_score\n", " hit['semantic_score_normalized'] = hit['semantic_score'] / max_semantic_score\n", " # Combine scores with weights (adjust weights as needed)\n", " hit['combined_score'] = (0.6 * hit['keyword_score_normalized']) + (0.4 * hit['semantic_score_normalized'])\n", "\n", " # Sort the hits based on combined score\n", " re_ranked_hits = sorted(combined_hits.values(), key=lambda x: x['combined_score'], reverse=True)\n", "\n", " # Step 5: Return Top-K Re-Ranked Documents\n", " final_results = re_ranked_hits[:final_top_k]\n", " return final_results\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# query = \"What is mean reversion strategy ?\" #\"Explain Warren Buffett's investment strategy focusing on companies with strong moats.\" #\n", "\n", "# res = hybrid_search(query)\n", "# res" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def build_prompt(query, search_results):\n", " \"\"\"\n", " Builds a prompt for the LLM using the query and search results.\n", "\n", " Parameters:\n", " - query (str): The user's question or query.\n", " - search_results (list): List of retrieved documents.\n", " \n", " Returns:\n", " - The formatted prompt string.\n", " \"\"\"\n", " # Instruction to the LLM\n", " instruction = (\n", " \"You are a financial analyst assistant with deep knowledge of trading strategies and behavioral finance. \"\n", " \"Using the provided context, answer the user's question. \"\n", " \"Use only the facts from the context when answering the question.\"\n", " \"If the context is insufficient, let the user know. \"\n", " \"Provide clear, concise explanations, and include relevant insights from the research papers.\\n\\n\"\n", " )\n", "\n", " # Build context from search results\n", " context = \"\"\n", " for result in search_results:\n", " # source = result['source']\n", " # year = result.get('year', '')\n", " content = result['content']\n", " summary = result['summary']\n", " key_topics = result['key_topics']\n", " context += f\"Content: {content}\\n Summary: {summary}\\n Key topics: {key_topics}\\n\\n\" #\n", "\n", "\n", " # Assemble the prompt\n", " prompt = f\"{instruction}Context:\\n{context}\\nQuestion: {query}\\nAnswer:\"\n", " return prompt\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# print (build_prompt(query, res))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def call_llm(prompt, model='gpt-4o-mini'):\n", " response = client.chat.completions.create(\n", " model=model,\n", " messages=[{'role': 'user', 'content': prompt}],\n", " max_tokens=2000,\n", " temperature=0.25\n", " )\n", " answer = response.choices[0].message.content.strip()\n", " return answer\n", "\n", "# Example prompt\n", "# prompt = build_prompt(query, res) #\"Explain Warren Buffett's investment strategy focusing on companies with strong moats.\"\n", "\n", "# # Get the response\n", "# response = call_llm(prompt)\n", "# print(response)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def rag_pipeline(query):\n", " \"\"\"\n", " Runs the Retrieval-Augmented Generation pipeline for a given query.\n", "\n", " Parameters:\n", " - query (str): The user's question or query.\n", " \n", " Returns:\n", " - The final answer from the LLM.\n", " \"\"\"\n", " search_results = hybrid_search(query)\n", " prompt = build_prompt(query, search_results)\n", "\n", " try:\n", " answer = call_llm(prompt)\n", " except Exception as e:\n", " answer = f\"An error occurred while generating the answer: {e}\"\n", "\n", " return answer\n", "\n", "\n", "# print (rag_pipeline(query))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def simulate_queries(questions_data):\n", " query_results = []\n", " for item in tqdm(questions_data):\n", " for question in item['questions']:\n", " true_chunk_id = item['chunk_id']\n", " # Retrieve top-N chunks using your RAG system\n", " retrieved_chunks = hybrid_search(question) # Implement this function\n", " retrieved_chunk_ids = [chunk['chunk_id'] for chunk in retrieved_chunks]\n", " query_results.append({\n", " 'question': question,\n", " 'true_chunk_id': true_chunk_id,\n", " 'retrieved_chunk_ids': retrieved_chunk_ids\n", " })\n", " return query_results\n", "\n", "def compute_hit_rate(query_results):\n", " hits = 0\n", " total = len(query_results)\n", " for result in tqdm(query_results):\n", " if result['true_chunk_id'] in result['retrieved_chunk_ids']:\n", " hits += 1\n", " hit_rate = hits / total\n", " return hit_rate\n", "\n", "def compute_mrr(query_results):\n", " rr_sum = 0\n", " total = len(query_results)\n", " for result in tqdm(query_results):\n", " try:\n", " rank = result['retrieved_chunk_ids'].index(result['true_chunk_id']) + 1\n", " rr_sum += 1 / rank\n", " except ValueError:\n", " continue # True chunk not in retrieved results\n", " mrr = rr_sum / total\n", " return mrr" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "668" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "questions_path = '../data/papers_questions_data.json'\n", "with open(questions_path, 'r') as json_file:\n", " questions_data = json.load(json_file)\n", "questions_data = [chunk for chunk in questions_data if chunk['source']=='paper']\n", "len(questions_data)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 668/668 [03:33<00:00, 3.12it/s]\n", "100%|██████████| 3340/3340 [00:00<00:00, 953186.05it/s]\n", "100%|██████████| 3340/3340 [00:00<00:00, 1978389.40it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "0.787125748502994 0.4930921490352624\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "query_results = simulate_queries(questions_data)\n", "print (compute_hit_rate(query_results), compute_mrr(query_results))\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "668" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chunk_content = {chnk['chunk_id']:chnk['content'] for chnk in all_data}\n", "len(chunk_content)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3340/3340 [09:24<00:00, 5.92it/s]\n" ] } ], "source": [ "import numpy as np\n", "\n", "def compute_relevancy_scores(query_results):\n", " relevancy_scores = []\n", " for result in tqdm(query_results):\n", " question_embedding = model.encode(result['question'])\n", " scores = []\n", " for chunk_id in result['retrieved_chunk_ids']:\n", " # Fetch the embedding of the chunk from PostgreSQL\n", " chunk_embedding = model.encode(chunk_content[chunk_id]).tolist() #get_chunk_embedding(chunk_id) # Implement this function\n", " similarity = np.dot(question_embedding, chunk_embedding) / (np.linalg.norm(question_embedding) * np.linalg.norm(chunk_embedding))\n", " scores.append(similarity)\n", " relevancy_scores.append({\n", " 'question': result['question'],\n", " 'true_chunk_id': result['true_chunk_id'],\n", " 'retrieved_chunk_ids': result['retrieved_chunk_ids'],\n", " 'similarity_scores': scores\n", " })\n", " return relevancy_scores\n", "\n", "relevancy_scores = compute_relevancy_scores(query_results)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "\n", "tmpDf = pd.DataFrame(relevancy_scores)[['true_chunk_id','similarity_scores']]\n", "tmpDf['score_avg'] = tmpDf.similarity_scores.apply(lambda x: float(np.median(x)))\n", "tmpDf.score_avg.hist()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 668/668 [00:00<00:00, 214237.27it/s]\n" ] } ], "source": [ "# Assume you have a list of evaluation data\n", "# Each item in evaluation_data is a dictionary with keys:\n", "# - 'question': The user question\n", "# - 'reference_answer': The expected correct answer (could be the chunk content)\n", "evaluation_data = []\n", "\n", "# Populate evaluation_data with your existing questions and reference answers\n", "for item in tqdm(questions_data):\n", " for question in item['questions']:\n", " # Use the original chunk content as the reference answer\n", " chunk_id = item['chunk_id']\n", " content = chunk_content[chunk_id] # Implement this function to fetch content\n", " evaluation_data.append({\n", " 'question': question,\n", " 'reference_answer': content\n", " })\n" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "def evaluate_rag_system(evaluation_data):\n", " for item in tqdm(evaluation_data):\n", " if 'rag_answer' in item.keys() :\n", " pass\n", " else:\n", " question = item['question']\n", " # Generate the RAG answer\n", " rag_answer = rag_pipeline(question)\n", " item['rag_answer'] = rag_answer\n", " return evaluation_data\n", "\n", "# Run the evaluation\n", "# evaluation_data = evaluate_rag_system(evaluation_data)\n" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def llm_judge(question, reference_answer, rag_answer, model='gpt-4o-mini'):\n", " prompt = f\"\"\"\n", "You are an expert evaluator. Compare the following two answers to the question provided. \n", "Assess the RAG-generated answer based on correctness, completeness, and relevance to the question, using the reference answer as the ground truth. \n", "Provide a score between 1 and 5, where 5 is the best; follow the below scoring guidelines for scoring.\n", "Scoring Guidelines:\n", "- 5: The RAG-generated answer is completely correct, fully addresses the question, and is as comprehensive as the reference answer.\n", "- 4: The answer is correct but may lack minor details compared to the reference.\n", "- 3: The answer is partially correct but misses significant details.\n", "- 2: The answer has some correct elements but contains notable inaccuracies.\n", "- 1: The answer is incorrect or irrelevant to the question.\n", "\n", "\n", "**Question:**\n", "{question}\n", "\n", "**Reference Answer:**\n", "{reference_answer}\n", "\n", "**RAG-Generated Answer:**\n", "{rag_answer}\n", "\n", "**Evaluation:**\n", "Score:\"\"\"\n", "\n", " response = client.chat.completions.create(\n", " model=model,\n", " messages=[{'role': 'user', 'content': prompt}],\n", " max_tokens=150,\n", " temperature=0,\n", " n=1,\n", " stop=None\n", " )\n", " evaluation_text = response.choices[0].message.content.strip()\n", " # Parse the score and reasoning\n", " score, reasoning = parse_evaluation(evaluation_text)\n", " return score, reasoning\n", "\n", "def parse_evaluation(evaluation_text):\n", " # Split the evaluation text to extract score and reasoning\n", " lines = evaluation_text.split('\\n')\n", " score = None\n", " reasoning = ''\n", " for line in lines:\n", " if line.strip().startswith('Score:'):\n", " score_line = line.strip()\n", " score_str = score_line.replace('Score:', '').strip()\n", " try:\n", " score = float(score_str)\n", " except ValueError:\n", " score = None\n", " elif line.strip().startswith('Reasoning:'):\n", " reasoning = line.strip().replace('Reasoning:', '').strip()\n", " else:\n", " if reasoning != '':\n", " reasoning += ' ' + line.strip()\n", " return score, reasoning\n" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def run_llm_evaluation(evaluation_data):\n", " for item in tqdm(evaluation_data):\n", " question = item['question']\n", " reference_answer = item['reference_answer']\n", " rag_answer = item['rag_answer']\n", " try:\n", " score, reasoning = llm_judge(question, reference_answer, rag_answer)\n", " item['score'] = score\n", " item['reasoning'] = reasoning\n", " except Exception as e:\n", " print(f\"Error evaluating question '{question}': {e}\")\n", " item['score'] = None\n", " item['reasoning'] = str(e)\n", " return evaluation_data\n", "\n", "# Run LLM evaluation\n", "# evaluation_data = run_llm_evaluation(evaluation_data)\n", "\n", "\n", "def analyze_results(evaluation_data):\n", " valid_scores = [item['score'] for item in evaluation_data if item['score'] is not None]\n", " average_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0\n", " print(f\"Average Score: {average_score:.2f}\")\n", "\n", " # Optional: Analyze reasoning\n", " for item in evaluation_data:\n", " print(f\"Question: {item['question']}\")\n", " print(f\"Score: {item['score']}\")\n", " print(f\"Reasoning: {item['reasoning']}\")\n", " print('-' * 50)\n", "\n", "# Analyze the results\n", "# analyze_results(evaluation_data)\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3340/3340 [11:16<00:00, 4.94it/s] \n" ] } ], "source": [ "# Step 2: Run RAG pipeline\n", "evaluation_data = evaluate_rag_system(evaluation_data)\n" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "output_path = '../data/evaluation_data.json'\n", "with open(output_path, 'w') as json_file:\n", " json.dump(evaluation_data, json_file, indent=4)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 3340/3340 [2:53:29<00:00, 3.12s/it] \n" ] } ], "source": [ "\n", "# Step 3: Use LLM as a judge\n", "evaluation_data = run_llm_evaluation(evaluation_data)\n" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "output_path = '../data/evaluation_data_fin.json'\n", "with open(output_path, 'w') as json_file:\n", " json.dump(evaluation_data, json_file, indent=4)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'analyze_results' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[1], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Step 4: Analyze results\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43manalyze_results\u001b[49m(evaluation_data)\n", "\u001b[0;31mNameError\u001b[0m: name 'analyze_results' is not defined" ] } ], "source": [ "\n", "# Step 4: Analyze results\n", "analyze_results(evaluation_data)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3340\n" ] }, { "data": { "text/plain": [ "{'question': 'What is the significance of detecting lead-lag relationships in stock returns?',\n", " 'reference_answer': 'DETECTING LEAD-LAGRELATIONSHIPS IN STOCK RETURNS AND PORTFOLIO STRATEGIES∗ Álvaro Cartea†‡Mihai Cucuringu∗§†¶Qi Jin∗‡∥ June 9, 2024 Click here for the most recent version ABSTRACT We propose a method to detect linear and nonlinear lead-lag relationships in stock returns. Our approach uses pairwise Lévy-area and cross-correlation of returns to rank the assets from leaders to followers. We use the rankings to construct a portfolio that longs or shorts the followers based on the previous returns of the leaders, and the stocks are ranked every time the portfolio is rebalanced. The portfolio also takes an offsetting position on the SPY ETF so that the initial value of the portfolio is zero. Our data spans from 1963 to 2022, and we use an average of over 500 stocks to construct portfolios for each trading day. The annualized returns of our lead-lag portfolios are over 20 %, and the returns outperform all lead-lag benchmarks in the literature. There is little overlap between the leaders and the followers we find and those that are reported in previous studies based on market capitalization, volume traded, and intra-industry relationships. Our findings support the slow information diffusion hypothesis; i.e., portfolios rebalanced once a day consistently outperform the bidiurnal, weekly, bi-weekly, tri-weekly, and monthly rebalanced portfolios. Keywords : Return prediction, Lead-lag relationships, Ranking, Lévy-area, Clustering JEL classification : G11, G12, G14, G17 ∗We thank Andrew Alden, Torben Andersen, Álvaro Arroyo, Patrick Chang, Fayçal Drissi, Anthony Ledford, Slavi Marinov, Sean Myers (discussant), Roberto Renò, and Harrison Waldon for helpful comments and feedback. We are grateful to audience at Man AHL, J.P. Morgan, GSA Capital, and Oxford Asset Management for comments. We are grateful to audience at the OMI Machine Learning and Financial Econometrics workshop and the Eastern Finance Association Annual Meeting for helpful comments. †Oxford-Man Institute of Quantitative Finance, University of Oxford ‡Mathematical Institute, University of Oxford §Department of Statistics, University of Oxford ¶The Alan Turing Institute, London, UK ∥Corresponding author; Email: qi.jin@st-annes.ox.ac.uk1 Introduction Changes in stock prices of some firms tend to follow those of other firms. This relationship between stock prices is often referred to as a lead-lag relationship. Detecting lead-lag relationships among a large set of stocks is not straightforward. The extant literature uses ad-hoc methods to select leaders and followers, and employs these two sets of stocks in investment strategies to evaluate the economic significance of the lead-lag relationship. For example, Lo and MacKinlay (1990) assume that large market capitalization stocks lead small market capitalization stocks. They build equal-weighted portfolios within each quantile of market capitalizations and use the cross-autocorrelation between the five portfolios to evaluate the trading performance of the lead-lag relationship. Empirical evidence suggests that firm size (Lo and MacKinlay (1990)), trading volume (Chordia and Swaminathan (2000)), institutional ownership (Badrinath et al. (1995)), and other firm characteristics contribute to the lead-lag identity of a stock. Empirically, however, many lead-lag relationships change over time and often cannot be explained by sorting stocks on a single firm characteristic.7This observation motivates that it is necessary to detect, instead of assume and then verify, lead-lag relationships. Our objective is to find lead-lag relationships without explicitly assuming a link between firm characteristics and lead-lag relationships; instead, we develop a data-driven method that employs stock returns to identify leaders and followers, and we show that the lead-lag relationships we find are economically significant. We achieve this in three steps. First, we design an algorithm that identifies the direction and strength of the lead-lag relationship between the returns of two stocks. Second, we propose a framework that uses state-of-the-art algorithms to rank stocks from leaders to followers based on the pairwise relationships. Third, we construct a zero-cost portfolio to assess the returns predictability of the leaders over the followers, and we measure the economic significance of the portfolio’s performance. Specifically, in the first step we design a method to score the lead-lag relationship between pairs of assets. The sign of the score indicates which of the two assets is more likely the leader, and the magnitude of the score quantifies the strength of the lead-lag relationship.',\n", " 'rag_answer': 'Detecting lead-lag relationships in stock returns is significant for several reasons:\\n\\n1. **Return Predictability**: Identifying which stocks lead and which follow allows investors to predict future returns based on past performance. This can enhance trading strategies by enabling investors to take positions in follower stocks based on the movements of leader stocks.\\n\\n2. **Portfolio Construction**: The ability to rank stocks from leaders to followers facilitates the construction of portfolios that can capitalize on these relationships. The study demonstrates that portfolios constructed using detected lead-lag relationships can achieve annualized returns exceeding 20%, significantly outperforming traditional benchmarks.\\n\\n3. **Economic Significance**: The findings support the slow information diffusion hypothesis, suggesting that some stocks react more slowly to common information. This insight can inform trading strategies and risk management practices, as it highlights the varying speeds at which different stocks respond to market events.\\n\\n4. **Robustness Across Time**: The lead-lag relationships identified are not static; they can change over time. This dynamic nature necessitates continuous monitoring and adjustment of trading strategies, which can lead to improved performance over time.\\n\\n5. **Nonlinear Relationships**: The methodology developed in the study captures both linear and nonlinear dependencies between stock returns, providing a more nuanced understanding of market behavior. This can lead to better risk-adjusted returns compared to methods that only consider linear relationships.\\n\\n6. **Market Efficiency Insights**: Understanding lead-lag relationships contributes to the broader discourse on market efficiency, particularly regarding how information is disseminated among market participants. It challenges traditional assumptions based on firm characteristics like market capitalization or trading volume.\\n\\nIn summary, detecting lead-lag relationships enhances trading strategies, informs portfolio construction, and provides insights into market dynamics and information flow, ultimately leading to better investment outcomes.',\n", " 'score': 4.0,\n", " 'reasoning': ''}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_path = '../data/evaluation_data_fin.json'\n", "with open(output_path, 'r') as json_file:\n", " evaluation_data = json.load(json_file)\n", "\n", "print (len(evaluation_data))\n", "evaluation_data[0]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "score\n", "4.0 1404\n", "5.0 326\n", "1.0 59\n", "2.0 54\n", "3.0 50\n", "Name: count, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "pd.DataFrame(evaluation_data)['score'].hist()\n", "pd.DataFrame(evaluation_data)['score'].value_counts()" ] } ], "metadata": { "kernelspec": { "display_name": "buffett-wisdom-rag-yofaZaKX", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 2 }