### Partition elements using Unstructured library 

In [None]:
# It may take longer to install the package
!pip install -qU \
    "unstructured[pdf]==0.12.4" \
    semantic-router

Start by downloading and processing an ArXiv paper.

In [1]:
from unstructured.partition.auto import partition

article_url = "https://arxiv.org/pdf/2402.05131.pdf"
elements = partition(url=article_url, strategy="hi_res", pdf_infer_table_structure=True)

  from .autonotebook import tqdm as notebook_tqdm
Conflict between variables skip_infer_table_types: ['pdf', 'jpg', 'png', 'xls', 'xlsx', 'heic'] and pdf_infer_table_structure: True, please reset skip_infer_table_types to turn on table extraction for PDFs.
This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initia

#### Define helper functions

Validate if parsed title element is a real title

In [2]:
import re


def is_valid_title(title: str) -> bool:
    # Rule 1: Title starts with a lowercase letter
    if re.match(r"^[a-z]", title):
        return False
    # Rule 2: Title has a special character (excluding :, -, and .)
    if re.search(r"[^\w\s:\-\.]", title):
        return False
    # Rule 3: Title ends with a dot
    if title.endswith("."):
        return False
    return True

Group elements by valid titles

In [3]:
from unstructured.documents.elements import Element
from colorama import Fore, Style
from typing import List, Dict


def group_elements_by_title(elements: List[Element]) -> Dict:
    grouped_elements = {}
    current_title = "Untitled"  # Default title for initial text without a title

    for element in elements:
        element_dict = element.to_dict()

        if element_dict.get("type") == "Title":
            potential_title = element_dict.get("text", "Untitled")
            if is_valid_title(potential_title):
                print(f"{Fore.GREEN}{potential_title}: True{Style.RESET_ALL}")
                current_title = potential_title
            else:
                print(f"{Fore.RED}{potential_title}: False{Style.RESET_ALL}")
                continue
        else:
            if current_title not in grouped_elements:
                grouped_elements[current_title] = []
            else:
                grouped_elements[current_title].append(element)
    return grouped_elements

Generates chunks grouped elements using semantic RollingWindow splitter

In [10]:
from semantic_router.splitters import RollingWindowSplitter
from typing import Dict


def create_title_chunks(
    grouped_elements: Dict, splitter: RollingWindowSplitter
) -> list:
    title_with_chunks = []
    for title, elements in grouped_elements.items():
        if not elements:
            continue
        combined_element_texts = []
        chunks = []

        for element in elements:
            if not element.text:
                continue
            element_dict = element.to_dict()
            if element_dict.get("type") == "Table":
                # Process accumulated text before the table
                if combined_element_texts:
                    splits = splitter(combined_element_texts)
                    print("-" * 80)
                    chunks.extend([split.content for split in splits])
                    combined_element_texts = []  # Reset combined texts after processing

                # Add table as a separate chunk
                table_text_html = element.metadata.text_as_html
                chunks.append(table_text_html)
            else:
                combined_element_texts.append(element.text)

        # Process any remaining accumulated text after the last table
        # or if no table was encountered

        if combined_element_texts:
            splits = splitter(combined_element_texts)
            print("-" * 80)
            chunks.extend([split.content for split in splits])

        if chunks:
            title_with_chunks.append({"title": title, "chunks": chunks})

    return title_with_chunks

Display chunked text in colors

In [11]:
from IPython.display import display, HTML
import itertools


def print_chunks_by_title(chunks_by_title):
    color_cycle = itertools.cycle(["red", "green", "blue", "magenta"])
    html_output = ""
    for section in chunks_by_title:
        title = section["title"]
        chunks = section["chunks"]
        html_output += f"<h3 style='color: black;'>{title}</h3>"
        for chunk in chunks:
            color = next(color_cycle)
            html_output += f"<p style='color: {color};'>{chunk}</p>"
    display(HTML(html_output))

### Process the elements

In [15]:
import os
from semantic_router.encoders import OpenAIEncoder

encoder = OpenAIEncoder(openai_api_key=os.environ["OPENAI_API_KEY"])

splitter = RollingWindowSplitter(
    encoder=encoder,
    window_size=1,  # Compares each element with the previous one
    min_split_tokens=1,
    max_split_tokens=500,
    plot_splits=False,
)

In [16]:
grouped_elements = group_elements_by_title(elements)

[31met! ee: False[0m
[31mb e F 0 1: False[0m
[31m] L C . s c [: False[0m
[32mFinancial Report Chunking for Eﬀective Retrieval Augmented Generation: True[0m
[32mIntroduction: True[0m
[31m2 Jimeno Yepes et al.: False[0m
[31m1 https://www.sec.gov 2 https://www.sec.gov/files/cf-frm.pdf: False[0m
[32m2 Related work: True[0m
[31m4 Jimeno Yepes et al.: False[0m
[32m3 Methods: True[0m
[32m3.1 RAG setting for the experiments: True[0m
[32m3.2 Indexing and retrieval: True[0m
[31m7 https://weaviate.io/developers/weaviate 8 https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-dot-: False[0m
[31mv1: False[0m
[32m3.3 Generation: True[0m
[31mQuestion: {query}: False[0m
[32m3.4 Chunking: True[0m
[32m3.5 Dataset: True[0m
[32m4 Results: True[0m
[31m11 https://platform.openai.com/docs/guides/embeddings/limitations-risks: False[0m
[31m10 Jimeno Yepes et al.: False[0m
[32m5 Discussion: True[0m
[31m12 Jimeno Yepes et al.: False[0m
[32m6 Conclusions a

In [17]:
chunks_by_title = create_title_chunks(grouped_elements, splitter)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
[32m2024-02-26 17:07:32 INFO semantic_router.utils.logger Optimal threshold 0.5 found with median tokens (27.0) in target range (1-500).[0m
[32m2024-02-26 17:07:32 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 1
  - Total Splits: 1
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 27
  - Maximum Token Size of Split: 27
  - Similarity Split Ratio: 0.00[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.7912974224053915 found with median tokens (136.5) in target range (1-500).[0m
[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 3
  - Total Splits: 2
  - Splits by Threshold: 1
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 19
  - Maximum Token Size of Split: 254
  - Similarity Split Ratio: 0.50[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Optimal threshold 0.8514465425347408 found with median tokens (129.5) in target range (1-500).[0m
[32m2024-02-26 17:07:33 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 7
  - Total Splits: 4
  - Splits by Threshold: 3
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 64
  - Maximum Token Size of Split: 400
  - Similarity Split Ratio: 0.75[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8371609601655312 found with median tokens (154.0) in target range (1-500).[0m
[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 7
  - Total Splits: 4
  - Splits by Threshold: 3
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 37
  - Maximum Token Size of Split: 362
  - Similarity Split Ratio: 0.75[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Optimal threshold 0.8004127909380481 found with median tokens (46.0) in target range (1-500).[0m
[32m2024-02-26 17:07:34 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 5
  - Total Splits: 3
  - Splits by Threshold: 2
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 15
  - Maximum Token Size of Split: 161
  - Similarity Split Ratio: 0.67[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7219220831968602 found with median tokens (94.0) in target range (1-500).[0m
[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 6
  - Total Splits: 3
  - Splits by Threshold: 2
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 8
  - Maximum Token Size of Split: 100
  - Similarity Split Ratio: 0.67[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Optimal threshold 0.7865543500746407 found with median tokens (92.5) in target range (1-500).[0m
[32m2024-02-26 17:07:35 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 4
  - Total Splits: 2
  - Splits by Threshold: 1
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 12
  - Maximum Token Size of Split: 173
  - Similarity Split Ratio: 0.50[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7759885849518695 found with median tokens (73.0) in target range (1-500).[0m
[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 9
  - Total Splits: 5
  - Splits by Threshold: 4
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 15
  - Maximum Token Size of Split: 210
  - Similarity Split Ratio: 0.80[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Optimal threshold 0.7356350410401438 found with median tokens (23.0) in target range (1-500).[0m
[32m2024-02-26 17:07:36 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 5
  - Total Splits: 3
  - Splits by Threshold: 2
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 8
  - Maximum Token Size of Split: 198
  - Similarity Split Ratio: 0.67[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7993056373716161 found with median tokens (14.0) in target range (1-500).[0m
[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 5
  - Total Splits: 3
  - Splits by Threshold: 2
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 10
  - Maximum Token Size of Split: 95
  - Similarity Split Ratio: 0.67[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Optimal threshold 0.7946781280578719 found with median tokens (104.5) in target range (1-500).[0m
[32m2024-02-26 17:07:37 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 4
  - Total Splits: 2
  - Splits by Threshold: 1
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 87
  - Maximum Token Size of Split: 122
  - Similarity Split Ratio: 0.50[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.7079124801171096 found with median tokens (15.0) in target range (1-500).[0m
[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 2
  - Total Splits: 1
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 15
  - Maximum Token Size of Split: 15
  - Similarity Split Ratio: 0.00[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Optimal threshold 0.8324466121743902 found with median tokens (110.5) in target range (1-500).[0m
[32m2024-02-26 17:07:38 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 12
  - Total Splits: 6
  - Splits by Threshold: 5
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 57
  - Maximum Token Size of Split: 254
  - Similarity Split Ratio: 0.83[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.8128022034342155 found with median tokens (16.5) in target range (1-500).[0m
[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 3
  - Total Splits: 2
  - Splits by Threshold: 1
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 4
  - Maximum Token Size of Split: 29
  - Similarity Split Ratio: 0.50[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Optimal threshold 0.786452236757286 found with median tokens (173.5) in target range (1-500).[0m
[32m2024-02-26 17:07:39 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 8
  - Total Splits: 4
  - Splits by Threshold: 3
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 8
  - Maximum Token Size of Split: 241
  - Similarity Split Ratio: 0.75[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:40 INFO semantic_router.utils.logger Optimal threshold 0.8250029487527775 found with median tokens (41.0) in target range (1-500).[0m
[32m2024-02-26 17:07:40 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 2
  - Total Splits: 1
  - Splits by Threshold: 0
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 41
  - Maximum Token Size of Split: 41
  - Similarity Split Ratio: 0.00[0m


--------------------------------------------------------------------------------


[32m2024-02-26 17:07:41 INFO semantic_router.utils.logger Optimal threshold 0.8086076732442027 found with median tokens (108.0) in target range (1-500).[0m
[32m2024-02-26 17:07:41 INFO semantic_router.utils.logger Splitting Statistics:
  - Total Documents: 45
  - Total Splits: 23
  - Splits by Threshold: 22
  - Splits by Max Chunk Size: 0
  - Last Split: 1
  - Minimum Token Size of Split: 4
  - Maximum Token Size of Split: 513
  - Similarity Split Ratio: 0.96[0m


--------------------------------------------------------------------------------


In [18]:
print_chunks_by_title(chunks_by_title)

Element Type,[Chipper Entities
NarrativeText,61780
Title,29664
ListItem,33054
UncategorizedText,9400
Footer,1026
Table,7700
Header,3959
Image,26
FigureCaption,54
Formula,29

Field,Value
,financebench-id|financebench.id_00859
doc_name,VERIZON.2021_10K
doc_link,https: //www.verizon.com/about/sites/default /files/2021-Annual- Report-on-Form-10-K.pdf
question_type,*novel-generated’
question,"Among all of the derivative instruments that Verizon used to manage] the exposure to fluctuations of foreign currencies exchange rates or interest rates, which one had the highest notional value in FY 2021?"
answer,"Cross currency swaps. Its notional value was $32,502 million.,"
evidence_text,"Derivative Instruments We enter into derivative transactions primarily to manage our exposure to fluctuations in foreign currency exchange rates and interest rates. We employ risk management strategies, which may include the use of a variety of derivatives including interest rate swaps, cross currency swaps, forward starting interest rate swaps, trea- sury rate locks, interest rate caps, swaptions and foreign exchange for- wards. We do not hold derivatives for trading purposes. The following table sets forth the notional amounts of our outstanding derivative in- struments: (dollars in millions) At December 31, 2021 2020 Interest rate swaps $ 19,779 $ 17,768 Cross currency swaps 32,502 26,288 Forward starting interest rate 1,000 2,000 Foreign exchange forwards 932"
page-number,

Processing|total,chunks|mean,chunks per document,(std)|tables mean (std)
Base 128,"| 64,058",800.73 (484.11),
Base 256,"| 32,051",400.64 (242.04) (,
Base 512,"| 16,046",200.58 (121. 01),
Chipper,20843,260.57 (145.80),96.20 (57.53)

Chunking strategy,Total Chunks},Page Accuracy,ROUGE|BLEU.
Base 128,64058,72.34,0.383
Base 256,32051,73.05,0.433
Base 512,16046,68.09,0.455
Base Aggregation,112155,83.69,0.536
Keywords Chipper,,46.1,0.444
Summary Chipper,,62.41,0.473
Prefix & Table Description Chipper,,67.38,0.514
Chipper Aggregation,a,84.4,0.568

Chunking strategy,No,Unnamed: 2,answer|GPT-4|Manual
Base 128,35.46,29.08,| 35.46
Base 256,5.5¢,32.62,| 36.88
Base 512,24.82,41.84,| 48.23
Keywords Chipper,22.70 |,43.97],53.19
Summary Chipper,17.73,|43.97]),51.77
Prefix & Table Description Chipper],20.57,41.13,| 53.19
