{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import PyPDF2\n",
    "import json\n",
    "import ast\n",
    "\n",
    "import nltk\n",
    "# nltk.download('punkt')\n",
    "\n",
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Load environment variables from the .envrc file\n",
    "load_dotenv('../.envrc')\n",
    "\n",
    "client = OpenAI(\n",
    "    api_key=os.getenv(\"OPENAI_API_KEY\"),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_questions_prompt(chunk_text, source):\n",
    "    if source == 'letter':\n",
    "        context_info = \"This text is an excerpt from one of Warren Buffett's annual shareholder letters, which often include investment philosophies, market insights, and business principles.\"\n",
    "    elif source == 'report':\n",
    "        context_info = \"This text is an excerpt from a company's annual report, containing financial statements, management discussions, and business performance analysis.\"\n",
    "    elif source == 'paper':\n",
    "        context_info = \"This text is an excerpt from a research paper about trading strategies and behavioural finance.\"\n",
    "    else:\n",
    "        context_info = \"\"\n",
    "    \n",
    "    prompt = f\"\"\"\n",
    "You are an expert financial analyst and researcher specializing in trading strategies and behavioral finance.\n",
    "Based on the following text, generate 5 relevant and insightful questions that a user might ask to better understand the content. \n",
    "The questions should be clear, concise, complete, not too short and should cover different aspects of the text. Use as fewer words as possible from the text. \n",
    "\n",
    "{context_info}\n",
    "\n",
    "Text:\n",
    "\\\"\\\"\\\"\n",
    "{chunk_text}\n",
    "\\\"\\\"\\\"\n",
    "\n",
    "Provide the output in parsable JSON without using code blocks:\n",
    "\n",
    "{{\"questions\": [\"question1\", \"question2\", ..., \"question5\"]}}\n",
    "\"\"\"\n",
    "    return prompt.strip()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_questions(chunk_text, source):\n",
    "    prompt = generate_questions_prompt(chunk_text, source)\n",
    "    \n",
    "    response = client.chat.completions.create(\n",
    "        model='gpt-4o-mini',  # Specify the model\n",
    "        messages=[\n",
    "            # The conversation history, starting with the user's prompt\n",
    "            {\"role\": \"user\", \"content\": prompt}\n",
    "        ],\n",
    "        max_tokens=250,        # Limit the response tokens\n",
    "        temperature=0.5,       # Control the randomness\n",
    "        n=1,                   # Number of responses to generate\n",
    "        stop=None              # When to stop generating tokens\n",
    "    )\n",
    "    questions_text = response.choices[0].message.content#.strip()\n",
    "    # # Split the questions into a list\n",
    "    # questions = questions_text.split('\\n')\n",
    "    # # Clean up and ensure we have 5 questions\n",
    "    # questions = [q.strip() for q in questions if q.strip()]\n",
    "    # # Remove numbering if present\n",
    "    # questions = [q[q.find('. ')+2:] if '. ' in q else q for q in questions]\n",
    "    # return questions[:5]\n",
    "    return questions_text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "668"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# letters_path = '../data/buffet_letters.json'\n",
    "# with open(letters_path, 'r') as json_file:\n",
    "#     processed_letters = json.load(json_file)\n",
    "\n",
    "# report_path = '../data/annual_reports.json'\n",
    "# with open(report_path, 'r') as json_file:\n",
    "#     processed_reports = json.load(json_file)\n",
    "\n",
    "papers_path = '../data/research_papers.json'\n",
    "with open(papers_path, 'r') as json_file:\n",
    "    processed_papers = json.load(json_file)\n",
    "\n",
    "\n",
    "# for item in processed_letters:\n",
    "#     metdt = item['metadata'].split('\\n')\n",
    "#     item['summary'] = metdt[0][len(\"summary:\"):].strip()\n",
    "#     item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n",
    "#     del item['metadata']\n",
    "    \n",
    "# for item in processed_reports:\n",
    "    \n",
    "#     metdt = item['metadata'].split('\\n')\n",
    "\n",
    "#     item['summary'] = metdt[0][len(\"summary:\"):].strip()\n",
    "#     item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n",
    "#     del item['metadata']\n",
    "\n",
    "#     item['source'] = item['ticker'] + \" annual \" + item['source']\n",
    "#     del item['ticker']\n",
    "\n",
    "for item in processed_papers:\n",
    "    \n",
    "    metdt = item['metadata'].split('\\n')\n",
    "\n",
    "    item['summary'] = metdt[0][len(\"summary:\"):].strip()\n",
    "    item['key_topics'] = [i.strip() for i in metdt[1][len(\"key_topics:\"):].strip().split(',')]\n",
    "    del item['metadata']\n",
    "\n",
    "\n",
    "\n",
    "all_data = processed_papers #processed_letters #+ processed_reports\n",
    "len(all_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'source': 'paper',\n",
       " 'chunk_id': 'ssrn-4599565_chunk_0',\n",
       " 'content': 'DETECTING LEAD-LAGRELATIONSHIPS IN STOCK RETURNS AND PORTFOLIO STRATEGIES∗ Álvaro Cartea†‡Mihai Cucuringu∗§†¶Qi Jin∗‡∥ June 9, 2024 Click here for the most recent version ABSTRACT We propose a method to detect linear and nonlinear lead-lag relationships in stock returns. Our approach uses pairwise Lévy-area and cross-correlation of returns to rank the assets from leaders to followers. We use the rankings to construct a portfolio that longs or shorts the followers based on the previous returns of the leaders, and the stocks are ranked every time the portfolio is rebalanced. The portfolio also takes an offsetting position on the SPY ETF so that the initial value of the portfolio is zero. Our data spans from 1963 to 2022, and we use an average of over 500 stocks to construct portfolios for each trading day. The annualized returns of our lead-lag portfolios are over 20 %, and the returns outperform all lead-lag benchmarks in the literature. There is little overlap between the leaders and the followers we find and those that are reported in previous studies based on market capitalization, volume traded, and intra-industry relationships. Our findings support the slow information diffusion hypothesis; i.e., portfolios rebalanced once a day consistently outperform the bidiurnal, weekly, bi-weekly, tri-weekly, and monthly rebalanced portfolios. Keywords : Return prediction, Lead-lag relationships, Ranking, Lévy-area, Clustering JEL classification : G11, G12, G14, G17 ∗We thank Andrew Alden, Torben Andersen, Álvaro Arroyo, Patrick Chang, Fayçal Drissi, Anthony Ledford, Slavi Marinov, Sean Myers (discussant), Roberto Renò, and Harrison Waldon for helpful comments and feedback. We are grateful to audience at Man AHL, J.P. Morgan, GSA Capital, and Oxford Asset Management for comments. We are grateful to audience at the OMI Machine Learning and Financial Econometrics workshop and the Eastern Finance Association Annual Meeting for helpful comments. †Oxford-Man Institute of Quantitative Finance, University of Oxford ‡Mathematical Institute, University of Oxford §Department of Statistics, University of Oxford ¶The Alan Turing Institute, London, UK ∥Corresponding author; Email: qi.jin@st-annes.ox.ac.uk1 Introduction Changes in stock prices of some firms tend to follow those of other firms. This relationship between stock prices is often referred to as a lead-lag relationship. Detecting lead-lag relationships among a large set of stocks is not straightforward. The extant literature uses ad-hoc methods to select leaders and followers, and employs these two sets of stocks in investment strategies to evaluate the economic significance of the lead-lag relationship. For example, Lo and MacKinlay (1990) assume that large market capitalization stocks lead small market capitalization stocks. They build equal-weighted portfolios within each quantile of market capitalizations and use the cross-autocorrelation between the five portfolios to evaluate the trading performance of the lead-lag relationship. Empirical evidence suggests that firm size (Lo and MacKinlay (1990)), trading volume (Chordia and Swaminathan (2000)), institutional ownership (Badrinath et al. (1995)), and other firm characteristics contribute to the lead-lag identity of a stock. Empirically, however, many lead-lag relationships change over time and often cannot be explained by sorting stocks on a single firm characteristic.7This observation motivates that it is necessary to detect, instead of assume and then verify, lead-lag relationships. Our objective is to find lead-lag relationships without explicitly assuming a link between firm characteristics and lead-lag relationships; instead, we develop a data-driven method that employs stock returns to identify leaders and followers, and we show that the lead-lag relationships we find are economically significant. We achieve this in three steps. First, we design an algorithm that identifies the direction and strength of the lead-lag relationship between the returns of two stocks. Second, we propose a framework that uses state-of-the-art algorithms to rank stocks from leaders to followers based on the pairwise relationships. Third, we construct a zero-cost portfolio to assess the returns predictability of the leaders over the followers, and we measure the economic significance of the portfolio’s performance. Specifically, in the first step we design a method to score the lead-lag relationship between pairs of assets. The sign of the score indicates which of the two assets is more likely the leader, and the magnitude of the score quantifies the strength of the lead-lag relationship.',\n",
       " 'summary': 'Method detects lead-lag relationships in stock returns for superior portfolio strategies.',\n",
       " 'key_topics': ['Lead-lag relationships',\n",
       "  'Stock returns',\n",
       "  'Portfolio strategies',\n",
       "  'Return prediction',\n",
       "  'Economic significance']}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tmpChunkList = []#[chnk['chunk_id'] for chnk in questions_data]\n",
    "len(tmpChunkList)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 668/668 [22:36<00:00,  2.03s/it]\n"
     ]
    }
   ],
   "source": [
    "questions_data = []\n",
    "\n",
    "for item in tqdm(all_data):  # all_data contains your chunks with content\n",
    "    if item['chunk_id'] not in tmpChunkList:\n",
    "        chunk_text = item['content'] + \"\\n\\n Summary: \" + item['summary'] + \"\\n\\n Key topics: \" + \", \".join(item['key_topics'])\n",
    "        chunk_id = item['chunk_id']\n",
    "        source = item['source']  # 'letter' or 'report'\n",
    "        try:\n",
    "            questions = generate_questions(chunk_text, source)\n",
    "            questions_data.append({\n",
    "                'chunk_id': chunk_id,\n",
    "                'source': source,\n",
    "                'questions': ast.literal_eval(questions)['questions']\n",
    "            })\n",
    "        except Exception as e:\n",
    "            print(f\"Error generating questions for chunk {chunk_id}: {e}\")\n",
    "    else:\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = '../data/papers_questions_data.json'\n",
    "with open(output_path, 'w') as json_file:\n",
    "    json.dump(questions_data, json_file, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = '../data/letters_questions_data.json'\n",
    "with open(output_path, 'w') as json_file:\n",
    "    json.dump(questions_data, json_file, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[chnk['chunk_id'] for chnk in questions_data if len(chnk['questions'])<5]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "buffett-wisdom-rag-yofaZaKX",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}