In [1]:
import os
import re
import PyPDF2
import json
import ast

import nltk
# nltk.download('punkt')

from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Pull variables defined in the project's .envrc into the process environment
# so that OPENAI_API_KEY can be looked up below.
load_dotenv('../.envrc')

# Single OpenAI client shared by the request code further down the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
def generate_questions_prompt(chunk_text, source):
    """Build the question-generation prompt for one text chunk.

    Args:
        chunk_text: Text the questions should be about. It is embedded
            between two lines of triple double-quotes.
        source: Kind of document the chunk comes from. 'letter', 'report'
            and 'paper' each add one sentence describing that document type;
            any other value adds no description.

    Returns:
        The prompt string with leading and trailing whitespace removed.
    """
    # (source kind, sentence telling the model what kind of document this is)
    source_contexts = (
        ('letter', "This text is an excerpt from one of Warren Buffett's annual shareholder letters, which often include investment philosophies, market insights, and business principles."),
        ('report', "This text is an excerpt from a company's annual report, containing financial statements, management discussions, and business performance analysis."),
        ('paper', "This text is an excerpt from a research paper about trading strategies and behavioural finance."),
    )
    context_info = next(
        (sentence for kind, sentence in source_contexts if source == kind),
        "",
    )

    # The prompt wording is kept verbatim, including the trailing space at the
    # end of the second and third lines.
    # NOTE(review): "Use as fewer words as possible" is ungrammatical; it is
    # left unchanged here so the prompts sent to the model stay the same.
    prompt_lines = [
        "You are an expert financial analyst and researcher specializing in trading strategies and behavioral finance.",
        "Based on the following text, generate 5 relevant and insightful questions that a user might ask to better understand the content. ",
        "The questions should be clear, concise, complete, not too short and should cover different aspects of the text. Use as fewer words as possible from the text. ",
        "",
        context_info,
        "",
        "Text:",
        '"""',
        f"{chunk_text}",
        '"""',
        "",
        "Provide the output in parsable JSON without using code blocks:",
        "",
        '{"questions": ["question1", "question2", ..., "question5"]}',
    ]
    return "\n".join(prompt_lines).strip()


In [3]:
def generate_questions(chunk_text, source):
    """Ask the chat model for five questions about one text chunk.

    Args:
        chunk_text: Text the questions should be about.
        source: Document kind, forwarded to ``generate_questions_prompt``.

    Returns:
        The raw message content of the first completion choice. It is
        expected to be a JSON object of the form ``{"questions": [...]}``;
        parsing is left to the caller.
    """
    prompt = generate_questions_prompt(chunk_text, source)

    response = client.chat.completions.create(
        model='gpt-4o-mini',  # Specify the model
        messages=[
            # The conversation history, starting with the user's prompt
            {"role": "user", "content": prompt}
        ],
        max_tokens=250,        # Limit the response tokens
        temperature=0.5,       # Control the randomness
        n=1,                   # Number of responses to generate
        stop=None,             # When to stop generating tokens
        # JSON mode: the reply is constrained to a single JSON object, so it
        # cannot arrive wrapped in a markdown code fence or surrounded by
        # prose, either of which would make the caller's parsing fail.
        # JSON mode requires the word "JSON" to appear in the messages; the
        # prompt's "Provide the output in parsable JSON" line satisfies that.
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content


In [4]:
def _split_metadata(item):
    """Replace item['metadata'] with separate 'summary' and 'key_topics' keys.

    The metadata string is split on newlines. The first line is assumed to
    start with "summary:" and the second with "key_topics:" followed by a
    comma-separated list; the prefixes are removed by length, not checked.
    The item is modified in place and its 'metadata' key is deleted.
    """
    metadata_lines = item['metadata'].split('\n')
    item['summary'] = metadata_lines[0][len("summary:"):].strip()
    item['key_topics'] = [
        topic.strip()
        for topic in metadata_lines[1][len("key_topics:"):].strip().split(',')
    ]
    del item['metadata']


# Alternative corpora, currently disabled. Uncomment a section and add its
# list to all_data below to process it.
#
# letters_path = '../data/buffet_letters.json'
# with open(letters_path, 'r', encoding='utf-8') as json_file:
#     processed_letters = json.load(json_file)
# for item in processed_letters:
#     _split_metadata(item)
#
# report_path = '../data/annual_reports.json'
# with open(report_path, 'r', encoding='utf-8') as json_file:
#     processed_reports = json.load(json_file)
# for item in processed_reports:
#     _split_metadata(item)
#     item['source'] = item['ticker'] + " annual " + item['source']
#     del item['ticker']

papers_path = '../data/research_papers.json'
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(papers_path, 'r', encoding='utf-8') as json_file:
    processed_papers = json.load(json_file)

for item in processed_papers:
    _split_metadata(item)

all_data = processed_papers #processed_letters #+ processed_reports
len(all_data)

668

In [5]:
# Display the first record to check the structure of a loaded chunk.
all_data[0]

{'source': 'paper',
 'chunk_id': 'ssrn-4599565_chunk_0',
 'content': 'DETECTING LEAD-LAGRELATIONSHIPS IN STOCK RETURNS AND PORTFOLIO STRATEGIES∗ Álvaro Cartea†‡Mihai Cucuringu∗§†¶Qi Jin∗‡∥ June 9, 2024 Click here for the most recent version ABSTRACT We propose a method to detect linear and nonlinear lead-lag relationships in stock returns. Our approach uses pairwise Lévy-area and cross-correlation of returns to rank the assets from leaders to followers. We use the rankings to construct a portfolio that longs or shorts the followers based on the previous returns of the leaders, and the stocks are ranked every time the portfolio is rebalanced. The portfolio also takes an offsetting position on the SPY ETF so that the initial value of the portfolio is zero. Our data spans from 1963 to 2022, and we use an average of over 500 stocks to construct portfolios for each trading day. The annualized returns of our lead-lag portfolios are over 20 %, and the returns outperform all lead-lag benchmark

In [6]:
# Chunk ids that the generation loop should skip. Empty here, so every chunk
# is processed; the commented-out expression rebuilds the list from the
# chunk ids of an existing questions_data.
tmpChunkList = []#[chnk['chunk_id'] for chnk in questions_data]
len(tmpChunkList)

0

In [7]:
def _parse_questions(raw_reply):
    """Return the list stored under 'questions' in the model's raw reply.

    Strict JSON is tried first, since JSON is what the prompt asks for.
    Replies that are not valid JSON but are valid Python literals (for
    example single-quoted strings) are still accepted through
    ast.literal_eval, which is how replies were parsed previously.

    Raises:
        ValueError, SyntaxError, KeyError or TypeError when the reply cannot
        be parsed or has no 'questions' entry.
    """
    try:
        parsed = json.loads(raw_reply)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers a None reply.
        parsed = ast.literal_eval(raw_reply)
    return parsed['questions']


# NOTE(review): the result list always starts empty, so questions from an
# earlier run are not carried over even when tmpChunkList skips their chunks.
questions_data = []

# Set copy of the skip list so the membership test is O(1) per chunk.
skip_ids = set(tmpChunkList)

for item in tqdm(all_data):  # all_data contains your chunks with content
    chunk_id = item['chunk_id']
    if chunk_id in skip_ids:
        continue

    source = item['source']  # e.g. 'letter', 'report' or 'paper'
    # Give the model the chunk plus its summary and key topics as context.
    chunk_text = item['content'] + "\n\n Summary: " + item['summary'] + "\n\n Key topics: " + ", ".join(item['key_topics'])
    try:
        questions = generate_questions(chunk_text, source)
        questions_data.append({
            'chunk_id': chunk_id,
            'source': source,
            'questions': _parse_questions(questions)
        })
    except Exception as e:
        # Best effort: report the failing chunk and continue with the rest.
        print(f"Error generating questions for chunk {chunk_id}: {e}")

100%|██████████| 668/668 [22:36<00:00,  2.03s/it]


In [8]:
# Save the generated questions for the research-paper chunks as indented JSON.
output_path = '../data/papers_questions_data.json'
with open(output_path, 'w') as json_file:
    json_file.write(json.dumps(questions_data, indent=4))

In [None]:
# Save the generated questions under the shareholder-letters file name.
# NOTE(review): this writes whatever questions_data currently holds. With
# all_data set to the research papers, running this cell would store paper
# questions in the letters file - confirm the active corpus before running.
output_path = '../data/letters_questions_data.json'
with open(output_path, 'w') as json_file:
    json_file.write(json.dumps(questions_data, indent=4))

In [16]:
# Sanity check: ids of chunks for which fewer than 5 questions were parsed.
[entry['chunk_id'] for entry in questions_data if len(entry['questions']) < 5]

[]