In [1]:
!pip install --upgrade langchain
!pip install --upgrade langchain-openai
!pip install --upgrade langchain-core
!pip install --upgrade langsmith
!pip install ctransformers[cuda]
!pip install huggingface-hub
!pip install --upgrade sqlalchemy
!pip install rdflib
!pip install llama-cpp-python
!pip install typing-extensions==4.7.1 --upgrade
!pip install pypdf2
!pip install sentence-transformers
!pip install faiss-gpu

[0mCollecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jsonpatch<2.0,>=1.33
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting pydantic<3,>=1
  Downloading pydantic-2.7.1-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.3/409.3 kB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
Collecting langsmith<0.2.0,>=0.1.17
  Downloading langsmith-0.1.50-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.5/115.5 kB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42
  Downloading langchain_core-0.1.45-py3-none-any.whl (291 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.3/291.3 kB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp<4.0.0,>=3.8.3
  Downloading aiohttp-

In [2]:
import transformers
import torch
from torch import cuda, bfloat16
import json

from PyPDF2 import PdfReader
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

In [3]:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
device = "cuda"

In [8]:
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_kwargs = {"device": "cpu"}
# tokenizer.pad_token = tokenizer.eos_token
embed_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
embed_model.client.tokenizer.pad_token = embed_model.client.tokenizer.eos_token

No sentence-transformers model found with name meta-llama/Meta-Llama-3-8B-Instruct. Creating a new one with MEAN pooling.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
embed_model_1 = HuggingFaceEmbeddings(model_name = "mixedbread-ai/mxbai-embed-large-v1")

In [38]:
def get_rag(paper_dir, embed_model):
    pdf = open(paper_dir, 'rb')
    pdf_reader = PdfReader(pdf)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len
    )
    chunks = text_splitter.split_text(text=text)
    return FAISS.from_texts(chunks, embedding=embed_model)

In [72]:
vector_store = get_rag("./paper/paper_3.pdf", embed_model)

In [73]:
SYS_PROMPT = """You are an assistant for extract information from the paper.
You are given the extracted parts of a paper about solar chemistry and a question. Provide the extracted information and nothing else.
"""

In [74]:
def search(query, k, embed_model):
    embed_q = embed_model.embed_query(query)
    return vector_store.similarity_search_with_score_by_vector(embed_q, k)

In [75]:
def format_prompt(prompt,retrieved_documents,k):
    PROMPT = f"Question:{prompt}\nContext:"
    for idx in range(k) :
        PROMPT+= f"{retrieved_documents[idx]}\n"
    return PROMPT

In [76]:
def generate(formatted_prompt):
    messages = [{"role":"system","content":SYS_PROMPT},{"role":"user","content":formatted_prompt}]
  # tell the model to generate
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(
        input_ids,
        max_new_tokens=1024,
        eos_token_id=terminators,
        do_sample=True,
        # temperature=0.6,
        top_p=0.9,
    )
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)

In [77]:
def rag_llama(prompt,k, embed_model):
    retrieved_documents = search(prompt, k, embed_model)
    formatted_prompt = format_prompt(prompt,retrieved_documents,k)
    return retrieved_documents, generate(formatted_prompt)

In [88]:
prompt_message = """Please Generate the Catalyst and co-catalyst from the paper, as well as generate the following information:Catalyst_set_up:['Powder(Suspended)', 'Film', 'Powder(Supported)', 'Powder', 'Other', 'Optical Fiber', 'SiO2 sublayer', 'Nanorribbons', 'Homogeneous', nan, 'Mesh', 'Monolith', 'Powder of nanocrystals', 'nanosheets', 'Nanofibers', 'ZnO nanorod array@carbon\nfiber (ZnO NA@CF) heterostructures', 'single-crystalline nanosheets', 'powder', 'Flaky g-C3N4 and uniformly loaded CdS nanospheres.', 'Nanocomposites.', 'powders', 'TiO2 nanosheets', 'Ultrathin SnNb2O6 2D nanosheets', 'nanotubes', 'tubes', 'nanorods', 'crystals', 'typical rod-like and belt-like structure', 'microrods', 'Hexagonal Platelets', 'Nickel on silica-alumina support', ' non-uniform grains', 'nanopowder', 'Single Crystal', 'Pellets', 'Powder(Sudpended)'], Reaction_medium:['Liquid', 'Gas'], Reactor_type: ['Slurry', 'Fixed-bed', 'OpticalFiber', 'Monolithic', 'Membrane', 'Fluidised-bed', nan], Operation_mode:['Batch', 'Continuous'], Lamp:['Mercury', 'Fluorescent', 'Xenon', 'SolarSimulator', 'Halogen', nan, 'Tungsten-Halide', 'Mercury-Xenon', 'Other', 'LED', 'Tungsten']
Please Only Selection the generation from the provided choices.
Please generating follow the format:
Catalyst: XXXX
Co-catalyst: XXXX
Catalyst_set_up: XXXX
Reaction_medium: XXXX
Reactor_type: XXXX
Operation_mode: XXXX
Lamp: XXXX
"""

In [95]:
doc, res = rag_llama(prompt_message, k = 5, embed_model = embed_model)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [101]:
res

'Catalyst: TiO2\nCo-catalyst: Silver\nCatalyst_set_up: Powder\nReaction_medium: Aqueous solution\nReactor_type: Fixed-bed\nOperation_mode: Batch\nLamp: Mercury'

In [99]:
result = {}

In [103]:
for i in range(10):
    idx = i+1
    paper_dir = "./paper/paper_"+ str(idx)+".pdf"
    vector_store = get_rag(paper_dir, embed_model)
    doc, temp = rag_llama(prompt_message, k = 3, embed_model = embed_model)
    result[str(idx)] =  {"location": doc, "extraction": {}}
    temp_list = temp.split("\n")
    for line in temp_list:
        print(line)
        if ":" in line:
            try:
                s, e = line.split(": ")
                result[str(idx)]["extraction"][s] = e
            except:
                pass

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Catalyst: TiO2
Co-catalyst: Platinum, Silver
Catalyst_set_up: Powder
Reaction_medium: Liquid
Reactor_type: Fixed-bed
Operation_mode: Batch
Lamp: SolarSimulator


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here are the extracted information:

Catalyst: TiO2
Co-catalyst: None mentioned
Catalyst_set_up: Powder(Supported)
Reaction_medium: Gas
Reactor_type: Membrane
Operation_mode: Batch
Lamp: UV


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the provided context, I extracted the following information:

Catalyst: TiO2
Co-catalyst: None
Catalyst_set_up: TiO2-coated optical fiber
Reaction_medium: Liquid
Reactor_type: Optical Fiber
Operation_mode: Batch
Lamp: None


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here is the extracted information:

Catalyst: TiO2
Co-catalyst: None
Catalyst_set_up: Suspended
Reaction_medium: Aqueous solutions, Liquid
Reactor_type: Slurry, Fixed-bed
Operation_mode: Batch, Continuous
Lamp: Mercury, Xenon, Solar Simulator


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the provided context, I extracted the following information:

Catalyst: Zirconium oxide
Co-catalyst: Not mentioned
Catalyst_set_up: Not mentioned
Reaction_medium: Liquid
Reactor_type: Not mentioned
Operation_mode: Not mentioned
Lamp: Not mentioned

Note that the information provided is limited to the given context, and further information may be needed to provide a complete answer.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Catalyst: TiO2
Co-catalyst: phosphoric acid, nitric acid, hydrochloric acid
Catalyst_set_up: Powder
Reaction_medium: Liquid, Gas
Reactor_type: Slurry, Fixed-bed
Operation_mode: Batch, Continuous
Lamp: nan


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Here is the extracted information:

Catalyst: Pt/TO-NP, Pt/TO-NT
Co-catalyst: None mentioned
Catalyst_set_up: Powder(Suspended), Powder(Supported)
Reaction_medium: Liquid
Reactor_type: Fixed-bed, Slurry
Operation_mode: Batch
Lamp: UV light lamp


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the provided context, here is the extracted information:

Catalyst: CuO–TiO2
Co-catalyst: None mentioned
Catalyst_set_up: Powder(Suspended)
Reaction_medium: Liquid
Reactor_type: Slurry
Operation_mode: Batch
Lamp: None mentioned


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Catalyst: TiO2
Co-catalyst: ZrO2
Catalyst_set_up: Powder(Suspended)
Reaction_medium: Gas
Reactor_type: Monolithic
Operation_mode: Batch
Lamp: Near –UV or UV lamps


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Based on the provided context, here is the extracted information:

Catalyst: TiO2
Co-catalyst: Cu (0.5 wt%)/TiO2
Catalyst_set_up: Film, Nanosheets
Reaction_medium: Liquid
Reactor_type: Optical Fiber
Operation_mode: Continuous
Lamp: UVA


In [104]:
result

{'1': {'location': [(Document(page_content='2powder, which exhibits a great potential in improv-\ning the TiO 2photocatalytic activity.\nIliev et al. [37] modiﬁed the commercially available TiO 2catalyst\n(Degussa P25) with nanosized platinum and silver particles by thephotoreduction method to obtain a different noble metal loading(0.5 and 1 wt.%). The degradation of oxalic acid has been studiedin aqueous solution photocatalyzed by irradiated TiO\n2modiﬁed\nwith nanosized silver particles. The photocatalytic activity of TiO 2,'),
    4956.349),
   (Document(page_content='2pow-\nder samples by degradation of 1,4-dichlorobenzene (DCB). Silverwas deposited on TiO\n2powders (Degussa P25) by the photodecom-\nposition of two different silver salts, AgF and AgNO 3. They observed\nthat the photocatalytic activity of Ag-loaded TiO 2ﬁlms was greater\nthan that of pure TiO 2ﬁlms.\nSeery et al. [34] described the synthesis of TiO 2containing dif-'),
    5007.8975),
   (Document(page_content='0926-

In [115]:
for i in range(10):
    idx = i+1
    temp = []
    for doc in result[str(idx)]["location"]:
        temp.append(doc[0].page_content)
    result[str(idx)]["location"] = temp

In [116]:
result["1"]["location"]

['2powder, which exhibits a great potential in improv-\ning the TiO 2photocatalytic activity.\nIliev et al. [37] modiﬁed the commercially available TiO 2catalyst\n(Degussa P25) with nanosized platinum and silver particles by thephotoreduction method to obtain a different noble metal loading(0.5 and 1 wt.%). The degradation of oxalic acid has been studiedin aqueous solution photocatalyzed by irradiated TiO\n2modiﬁed\nwith nanosized silver particles. The photocatalytic activity of TiO 2,',
 '2pow-\nder samples by degradation of 1,4-dichlorobenzene (DCB). Silverwas deposited on TiO\n2powders (Degussa P25) by the photodecom-\nposition of two different silver salts, AgF and AgNO 3. They observed\nthat the photocatalytic activity of Ag-loaded TiO 2ﬁlms was greater\nthan that of pure TiO 2ﬁlms.\nSeery et al. [34] described the synthesis of TiO 2containing dif-',
 '0926-3373/$ – see front matter © 2010 Elsevier B.V. All rights reserved.\ndoi:10.1016/j.apcatb.2010.02.030240 K. Ko ˇcí et al. / A

In [117]:
import json
with open("result_llama3_8b.json", "w") as f:
    json.dump(result, f)