{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "4e082724-c77a-4188-889a-fb5eb028d298", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"data\": [\n", " {\n", " \"scale_settings\": {\n", " \"scale_type\": \"standard\"\n", " },\n", " \"model\": \"text-davinci-002\",\n", " \"owner\": \"organization-owner\",\n", " \"id\": \"text-davinci-002\",\n", " \"status\": \"succeeded\",\n", " \"created_at\": 1657572678,\n", " \"updated_at\": 1657572678,\n", " \"object\": \"deployment\"\n", " },\n", " {\n", " \"scale_settings\": {\n", " \"scale_type\": \"standard\"\n", " },\n", " \"model\": \"code-cushman-001\",\n", " \"owner\": \"organization-owner\",\n", " \"id\": \"code-cushman-001\",\n", " \"status\": \"succeeded\",\n", " \"created_at\": 1657572712,\n", " \"updated_at\": 1657572712,\n", " \"object\": \"deployment\"\n", " },\n", " {\n", " \"scale_settings\": {\n", " \"scale_type\": \"standard\"\n", " },\n", " \"model\": \"text-search-curie-doc-001\",\n", " \"owner\": \"organization-owner\",\n", " \"id\": \"text-search-curie-doc-001\",\n", " \"status\": \"succeeded\",\n", " \"created_at\": 1668620345,\n", " \"updated_at\": 1668620345,\n", " \"object\": \"deployment\"\n", " },\n", " {\n", " \"scale_settings\": {\n", " \"scale_type\": \"standard\"\n", " },\n", " \"model\": \"text-search-curie-query-001\",\n", " \"owner\": \"organization-owner\",\n", " \"id\": \"text-search-curie-query-001\",\n", " \"status\": \"succeeded\",\n", " \"created_at\": 1669048765,\n", " \"updated_at\": 1669048765,\n", " \"object\": \"deployment\"\n", " }\n", " ],\n", " \"object\": \"list\"\n", "}\n" ] } ], "source": [ "import openai\n", "import re\n", "import requests\n", "import sys\n", "from num2words import num2words\n", "import os\n", "import pandas as pd\n", "import numpy as np\n", "from openai.embeddings_utils import get_embedding, cosine_similarity\n", "from transformers import GPT2TokenizerFast\n", "\n", "API_KEY = 
os.getenv(\"AZURE_OPENAI_API_KEY\") \n", "RESOURCE_ENDPOINT = os.getenv(\"AZURE_OPENAI_ENDPOINT\") \n", "\n", "# Fail early with a clear message instead of an opaque TypeError when the URL is built below.\n", "if not API_KEY or not RESOURCE_ENDPOINT:\n", "    raise EnvironmentError(\"Set AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT before running this notebook.\")\n", "\n", "openai.api_type = \"azure\"\n", "openai.api_key = API_KEY\n", "openai.api_base = RESOURCE_ENDPOINT\n", "openai.api_version = \"2022-12-01\"\n", "\n", "# List the deployments on the resource. Reuse openai.api_version so the two values cannot drift apart.\n", "url = openai.api_base + \"/openai/deployments?api-version=\" + openai.api_version\n", "\n", "# A timeout keeps the cell from hanging forever if the endpoint is unreachable.\n", "r = requests.get(url, headers={\"api-key\": API_KEY}, timeout=30)\n", "\n", "print(r.text)" ] }, { "cell_type": "code", "execution_count": 4, "id": "88f6fb03-6f92-41b0-9491-a19f887012b4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0bill_idtextsummarytitletext_lensum_len
00110_hr37SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...National Science Education Tax Incentive for B...To amend the Internal Revenue Code of 1986 to ...8494321
11112_hr2873SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Small Business Expansion and Hiring Act of 201...To amend the Internal Revenue Code of 1986 to ...65221424
22109_s2408SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...Requires the Director of National Intelligence...A bill to require the Director of National Int...6154463
33108_s1899SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...National Cancer Act of 2003 - Amends the Publi...A bill to improve data collection and dissemin...198531400
44107_s1531SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Military Call-up Relief Act - Amends the Inter...A bill to amend the Internal Revenue Code of 1...6273278
55107_hr4541SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...Requires the Customs Service to reliquidate ce...To provide for reliquidation of entries premat...11691114
66111_s1495SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Service Dogs for Veterans Act of 2009 - Direct...A bill to require the Secretary of Veterans Af...5328379
77111_s3885SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Race to the Top Act of 2010 - Directs the Secr...A bill to provide incentives for States and lo...166681525
88113_hr1796SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Troop Talent Act of 2013 - Directs the Secreta...Troop Talent Act of 2013153522151
99103_hr1987SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Taxpayer's Right to View Act of 1993 - Amends ...Taxpayer's Right to View Act of 19935633894
1010103_hr1677SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Full-Service Schools Act - Establishes the Fed...Full-Service Schools Act124721107
1111111_s3149SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Wall Street Compensation Reform Act of 2010 - ...A bill to amend the Internal Revenue Code of 1...182261297
1212110_hr1007SECTION 1. FINDINGS.\\r\\n\\r\\n The Congress f...Amends the Marine Mammal Protection Act of 197...To amend the Marine Mammal Protection Act of 1...5261276
1313113_hr3137SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Freedom and Mobility in Consumer Banking Act -...Freedom and Mobility in Consumer Banking Act176902044
1414115_hr1634SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Education and Training for Health Act of 2017 ...Education and Training for Health Act of 20179037772
1515103_hr1815SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Recreational Hunting Safety and Preservation A...Recreational Hunting Safety and Preservation A...13024475
1616113_s1773SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Andrew Prior Act or Andrew's Law - Amends the ...Andrew's Law5149613
1717106_hr5585SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Directs the President, in coordination with de...Energy Independence Act of 20008007810
1818114_hr2499SECTION 1. SHORT TITLE.\\r\\n This Act may be...This measure has not been amended since it was...Veterans Entrepreneurship Act of 201575391421
1919111_hr3141SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Strengthening the Health Care Safety Net Act o...To amend title XIX of the Social Security Act ...18429514
\n", "
" ], "text/plain": [ " Unnamed: 0 bill_id text \\\n", "0 0 110_hr37 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "1 1 112_hr2873 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "2 2 109_s2408 SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR... \n", "3 3 108_s1899 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "4 4 107_s1531 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "5 5 107_hr4541 SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR... \n", "6 6 111_s1495 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "7 7 111_s3885 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "8 8 113_hr1796 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "9 9 103_hr1987 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "10 10 103_hr1677 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "11 11 111_s3149 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "12 12 110_hr1007 SECTION 1. FINDINGS.\\r\\n\\r\\n The Congress f... \n", "13 13 113_hr3137 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "14 14 115_hr1634 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "15 15 103_hr1815 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "16 16 113_s1773 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "17 17 106_hr5585 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "18 18 114_hr2499 SECTION 1. SHORT TITLE.\\r\\n This Act may be... \n", "19 19 111_hr3141 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "\n", " summary \\\n", "0 National Science Education Tax Incentive for B... \n", "1 Small Business Expansion and Hiring Act of 201... \n", "2 Requires the Director of National Intelligence... \n", "3 National Cancer Act of 2003 - Amends the Publi... \n", "4 Military Call-up Relief Act - Amends the Inter... \n", "5 Requires the Customs Service to reliquidate ce... \n", "6 Service Dogs for Veterans Act of 2009 - Direct... \n", "7 Race to the Top Act of 2010 - Directs the Secr... 
\n", "8 Troop Talent Act of 2013 - Directs the Secreta... \n", "9 Taxpayer's Right to View Act of 1993 - Amends ... \n", "10 Full-Service Schools Act - Establishes the Fed... \n", "11 Wall Street Compensation Reform Act of 2010 - ... \n", "12 Amends the Marine Mammal Protection Act of 197... \n", "13 Freedom and Mobility in Consumer Banking Act -... \n", "14 Education and Training for Health Act of 2017 ... \n", "15 Recreational Hunting Safety and Preservation A... \n", "16 Andrew Prior Act or Andrew's Law - Amends the ... \n", "17 Directs the President, in coordination with de... \n", "18 This measure has not been amended since it was... \n", "19 Strengthening the Health Care Safety Net Act o... \n", "\n", " title text_len sum_len \n", "0 To amend the Internal Revenue Code of 1986 to ... 8494 321 \n", "1 To amend the Internal Revenue Code of 1986 to ... 6522 1424 \n", "2 A bill to require the Director of National Int... 6154 463 \n", "3 A bill to improve data collection and dissemin... 19853 1400 \n", "4 A bill to amend the Internal Revenue Code of 1... 6273 278 \n", "5 To provide for reliquidation of entries premat... 11691 114 \n", "6 A bill to require the Secretary of Veterans Af... 5328 379 \n", "7 A bill to provide incentives for States and lo... 16668 1525 \n", "8 Troop Talent Act of 2013 15352 2151 \n", "9 Taxpayer's Right to View Act of 1993 5633 894 \n", "10 Full-Service Schools Act 12472 1107 \n", "11 A bill to amend the Internal Revenue Code of 1... 18226 1297 \n", "12 To amend the Marine Mammal Protection Act of 1... 5261 276 \n", "13 Freedom and Mobility in Consumer Banking Act 17690 2044 \n", "14 Education and Training for Health Act of 2017 9037 772 \n", "15 Recreational Hunting Safety and Preservation A... 13024 475 \n", "16 Andrew's Law 5149 613 \n", "17 Energy Independence Act of 2000 8007 810 \n", "18 Veterans Entrepreneurship Act of 2015 7539 1421 \n", "19 To amend title XIX of the Social Security Act ... 
18429 514 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\"c:\\\\bill_sum_data.csv\") # example: df = pd.read_csv(\"c:\\\\test\\\\bill_sum_data.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 5, "id": "9ee11565-f6c4-450b-9fc4-e42249ae2511", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsummarytitle
0SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...National Science Education Tax Incentive for B...To amend the Internal Revenue Code of 1986 to ...
1SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Small Business Expansion and Hiring Act of 201...To amend the Internal Revenue Code of 1986 to ...
2SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...Requires the Director of National Intelligence...A bill to require the Director of National Int...
3SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...National Cancer Act of 2003 - Amends the Publi...A bill to improve data collection and dissemin...
4SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Military Call-up Relief Act - Amends the Inter...A bill to amend the Internal Revenue Code of 1...
5SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...Requires the Customs Service to reliquidate ce...To provide for reliquidation of entries premat...
6SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Service Dogs for Veterans Act of 2009 - Direct...A bill to require the Secretary of Veterans Af...
7SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Race to the Top Act of 2010 - Directs the Secr...A bill to provide incentives for States and lo...
8SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Troop Talent Act of 2013 - Directs the Secreta...Troop Talent Act of 2013
9SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Taxpayer's Right to View Act of 1993 - Amends ...Taxpayer's Right to View Act of 1993
10SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Full-Service Schools Act - Establishes the Fed...Full-Service Schools Act
11SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Wall Street Compensation Reform Act of 2010 - ...A bill to amend the Internal Revenue Code of 1...
12SECTION 1. FINDINGS.\\r\\n\\r\\n The Congress f...Amends the Marine Mammal Protection Act of 197...To amend the Marine Mammal Protection Act of 1...
13SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Freedom and Mobility in Consumer Banking Act -...Freedom and Mobility in Consumer Banking Act
14SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Education and Training for Health Act of 2017 ...Education and Training for Health Act of 2017
15SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Recreational Hunting Safety and Preservation A...Recreational Hunting Safety and Preservation A...
16SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Andrew Prior Act or Andrew's Law - Amends the ...Andrew's Law
17SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Directs the President, in coordination with de...Energy Independence Act of 2000
18SECTION 1. SHORT TITLE.\\r\\n This Act may be...This measure has not been amended since it was...Veterans Entrepreneurship Act of 2015
19SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma...Strengthening the Health Care Safety Net Act o...To amend title XIX of the Social Security Act ...
\n", "
" ], "text/plain": [ " text \\\n", "0 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "1 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "2 SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR... \n", "3 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "4 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "5 SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR... \n", "6 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "7 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "8 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "9 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "10 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "11 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "12 SECTION 1. FINDINGS.\\r\\n\\r\\n The Congress f... \n", "13 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "14 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "15 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "16 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "17 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "18 SECTION 1. SHORT TITLE.\\r\\n This Act may be... \n", "19 SECTION 1. SHORT TITLE.\\r\\n\\r\\n This Act ma... \n", "\n", " summary \\\n", "0 National Science Education Tax Incentive for B... \n", "1 Small Business Expansion and Hiring Act of 201... \n", "2 Requires the Director of National Intelligence... \n", "3 National Cancer Act of 2003 - Amends the Publi... \n", "4 Military Call-up Relief Act - Amends the Inter... \n", "5 Requires the Customs Service to reliquidate ce... \n", "6 Service Dogs for Veterans Act of 2009 - Direct... \n", "7 Race to the Top Act of 2010 - Directs the Secr... \n", "8 Troop Talent Act of 2013 - Directs the Secreta... \n", "9 Taxpayer's Right to View Act of 1993 - Amends ... \n", "10 Full-Service Schools Act - Establishes the Fed... \n", "11 Wall Street Compensation Reform Act of 2010 - ... \n", "12 Amends the Marine Mammal Protection Act of 197... 
\n", "13 Freedom and Mobility in Consumer Banking Act -... \n", "14 Education and Training for Health Act of 2017 ... \n", "15 Recreational Hunting Safety and Preservation A... \n", "16 Andrew Prior Act or Andrew's Law - Amends the ... \n", "17 Directs the President, in coordination with de... \n", "18 This measure has not been amended since it was... \n", "19 Strengthening the Health Care Safety Net Act o... \n", "\n", " title \n", "0 To amend the Internal Revenue Code of 1986 to ... \n", "1 To amend the Internal Revenue Code of 1986 to ... \n", "2 A bill to require the Director of National Int... \n", "3 A bill to improve data collection and dissemin... \n", "4 A bill to amend the Internal Revenue Code of 1... \n", "5 To provide for reliquidation of entries premat... \n", "6 A bill to require the Secretary of Veterans Af... \n", "7 A bill to provide incentives for States and lo... \n", "8 Troop Talent Act of 2013 \n", "9 Taxpayer's Right to View Act of 1993 \n", "10 Full-Service Schools Act \n", "11 A bill to amend the Internal Revenue Code of 1... \n", "12 To amend the Marine Mammal Protection Act of 1... \n", "13 Freedom and Mobility in Consumer Banking Act \n", "14 Education and Training for Health Act of 2017 \n", "15 Recreational Hunting Safety and Preservation A... \n", "16 Andrew's Law \n", "17 Energy Independence Act of 2000 \n", "18 Veterans Entrepreneurship Act of 2015 \n", "19 To amend title XIX of the Social Security Act ... 
" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .copy() makes df_bills an independent DataFrame rather than a slice of df, so writing\n", "# columns to it in later cells does not raise SettingWithCopyWarning.\n", "df_bills = df[['text', 'summary', 'title']].copy()\n", "df_bills" ] }, { "cell_type": "code", "execution_count": 6, "id": "5c055a39-29f9-4c32-8820-27284eed0bc7", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\contoso\\AppData\\Local\\Temp\\ipykernel_50576\\3185142679.py:13: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_bills['text'] = df_bills[\"text\"].apply(lambda x : normalize_text(x))\n" ] } ], "source": [ "def normalize_text(s, sep_token = \" \\n \"):\n", "    \"\"\"Clean up raw text before it is tokenized.\n", "\n", "    Collapses every run of whitespace (spaces, tabs, line breaks) into a single\n", "    space and removes a few stray punctuation sequences.\n", "\n", "    s: the input text.\n", "    sep_token: not used by this function; kept so callers that pass it keep working.\n", "    Returns the cleaned string.\n", "    \"\"\"\n", "    # Collapse all runs of whitespace to one space; this also removes every newline,\n", "    # so no separate newline replacement is needed afterwards.\n", "    s = re.sub(r'\\s+', ' ', s).strip()\n", "    # Drop stray \". ,\" sequences. The dot is escaped so it matches a literal period;\n", "    # unescaped it matched ANY character and deleted the last character of a word before \" ,\".\n", "    s = re.sub(r\"\\. ,\", \"\", s)\n", "    s = s.replace(\"..\", \".\")\n", "    s = s.replace(\". .\", \".\")\n", "    # The removals above can leave whitespace at either end.\n", "    s = s.strip()\n", "\n", "    return s\n", "\n", "df_bills['text'] = df_bills[\"text\"].apply(normalize_text)" ] }, { "cell_type": "code", "execution_count": 7, "id": "5ffcb5fa-60f6-416b-a1e7-87bd92685511", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Token indices sequence length is longer than the specified maximum sequence length for this model (1480 > 1024). 
Running this sequence through the model will result in indexing errors\n", "C:\\Users\\contoso\\AppData\\Local\\Temp\\ipykernel_50576\\1732209292.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_bills['n_tokens'] = df_bills[\"text\"].apply(lambda x: len(tokenizer.encode(x)))\n" ] }, { "data": { "text/plain": [ "12" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n", "# assign() returns a new DataFrame rather than writing into df_bills in place,\n", "# so it cannot raise SettingWithCopyWarning.\n", "df_bills = df_bills.assign(n_tokens=df_bills[\"text\"].apply(lambda x: len(tokenizer.encode(x))))\n", "# Keep only the bills whose text is under 2000 tokens; .copy() so later cells can add\n", "# columns to the filtered frame without SettingWithCopyWarning.\n", "df_bills = df_bills[df_bills.n_tokens < 2000].copy()\n", "len(df_bills)" ] }, { "cell_type": "code", "execution_count": 8, "id": "6b55d1f6-c8d4-45ad-bef8-ffc41ff67d43", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsummarytitlen_tokens
0SECTION 1. SHORT TITLE. This Act may be cited ...National Science Education Tax Incentive for B...To amend the Internal Revenue Code of 1986 to ...1480
1SECTION 1. SHORT TITLE. This Act may be cited ...Small Business Expansion and Hiring Act of 201...To amend the Internal Revenue Code of 1986 to ...1152
2SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...Requires the Director of National Intelligence...A bill to require the Director of National Int...930
4SECTION 1. SHORT TITLE. This Act may be cited ...Military Call-up Relief Act - Amends the Inter...A bill to amend the Internal Revenue Code of 1...1048
5SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...Requires the Customs Service to reliquidate ce...To provide for reliquidation of entries premat...1846
6SECTION 1. SHORT TITLE. This Act may be cited ...Service Dogs for Veterans Act of 2009 - Direct...A bill to require the Secretary of Veterans Af...872
9SECTION 1. SHORT TITLE. This Act may be cited ...Taxpayer's Right to View Act of 1993 - Amends ...Taxpayer's Right to View Act of 1993946
12SECTION 1. FINDINGS. The Congress finds the fo...Amends the Marine Mammal Protection Act of 197...To amend the Marine Mammal Protection Act of 1...1223
14SECTION 1. SHORT TITLE. This Act may be cited ...Education and Training for Health Act of 2017 ...Education and Training for Health Act of 20171596
16SECTION 1. SHORT TITLE. This Act may be cited ...Andrew Prior Act or Andrew's Law - Amends the ...Andrew's Law608
17SECTION 1. SHORT TITLE. This Act may be cited ...Directs the President, in coordination with de...Energy Independence Act of 20001341
18SECTION 1. SHORT TITLE. This Act may be cited ...This measure has not been amended since it was...Veterans Entrepreneurship Act of 20151404
\n", "
" ], "text/plain": [ " text \\\n", "0 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "1 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "2 SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR... \n", "4 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "5 SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR... \n", "6 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "9 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "12 SECTION 1. FINDINGS. The Congress finds the fo... \n", "14 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "16 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "17 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "18 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "\n", " summary \\\n", "0 National Science Education Tax Incentive for B... \n", "1 Small Business Expansion and Hiring Act of 201... \n", "2 Requires the Director of National Intelligence... \n", "4 Military Call-up Relief Act - Amends the Inter... \n", "5 Requires the Customs Service to reliquidate ce... \n", "6 Service Dogs for Veterans Act of 2009 - Direct... \n", "9 Taxpayer's Right to View Act of 1993 - Amends ... \n", "12 Amends the Marine Mammal Protection Act of 197... \n", "14 Education and Training for Health Act of 2017 ... \n", "16 Andrew Prior Act or Andrew's Law - Amends the ... \n", "17 Directs the President, in coordination with de... \n", "18 This measure has not been amended since it was... \n", "\n", " title n_tokens \n", "0 To amend the Internal Revenue Code of 1986 to ... 1480 \n", "1 To amend the Internal Revenue Code of 1986 to ... 1152 \n", "2 A bill to require the Director of National Int... 930 \n", "4 A bill to amend the Internal Revenue Code of 1... 1048 \n", "5 To provide for reliquidation of entries premat... 1846 \n", "6 A bill to require the Secretary of Veterans Af... 872 \n", "9 Taxpayer's Right to View Act of 1993 946 \n", "12 To amend the Marine Mammal Protection Act of 1... 
1223 \n", "14 Education and Training for Health Act of 2017 1596 \n", "16 Andrew's Law 608 \n", "17 Energy Independence Act of 2000 1341 \n", "18 Veterans Entrepreneurship Act of 2015 1404 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_bills" ] }, { "cell_type": "code", "execution_count": 9, "id": "18352c6e-6891-4f68-a8f6-9a17f8fdcc0f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['S',\n", " 'ECTION',\n", " 'Ġ1',\n", " '.',\n", " 'ĠSH',\n", " 'ORT',\n", " 'ĠTIT',\n", " 'LE',\n", " '.',\n", " 'ĠThis',\n", " 'ĠAct',\n", " 'Ġmay',\n", " 'Ġbe',\n", " 'Ġcited',\n", " 'Ġas',\n", " 'Ġthe',\n", " 'Ġ``',\n", " 'National',\n", " 'ĠScience',\n", " 'ĠEducation',\n", " 'ĠTax',\n", " 'ĠIn',\n", " 'cent',\n", " 'ive',\n", " 'Ġfor',\n", " 'ĠBusiness',\n", " 'es',\n", " 'ĠAct',\n", " 'Ġof',\n", " 'Ġ2007',\n", " \"''.\",\n", " 'ĠSEC',\n", " '.',\n", " 'Ġ2',\n", " '.',\n", " 'ĠCR',\n", " 'EDIT',\n", " 'S',\n", " 'ĠFOR',\n", " 'ĠC',\n", " 'ER',\n", " 'TAIN',\n", " 'ĠCONTR',\n", " 'IB',\n", " 'UT',\n", " 'IONS',\n", " 'ĠBEN',\n", " 'EF',\n", " 'IT',\n", " 'ING',\n", " 'ĠSC',\n", " 'IENCE',\n", " ',',\n", " 'ĠTECH',\n", " 'N',\n", " 'OLOGY',\n", " ',',\n", " 'ĠENG',\n", " 'INE',\n", " 'ER',\n", " 'ING',\n", " ',',\n", " 'ĠAND',\n", " 'ĠM',\n", " 'ATH',\n", " 'EM',\n", " 'AT',\n", " 'ICS',\n", " 'ĠED',\n", " 'UC',\n", " 'ATION',\n", " 'ĠAT',\n", " 'ĠTHE',\n", " 'ĠELE',\n", " 'MENT',\n", " 'ARY',\n", " 'ĠAND',\n", " 'ĠSEC',\n", " 'OND',\n", " 'ARY',\n", " 'ĠSCHOOL',\n", " 'ĠLEVEL',\n", " '.',\n", " 'Ġ(',\n", " 'a',\n", " ')',\n", " 'ĠIn',\n", " 'ĠGeneral',\n", " '.--',\n", " 'Sub',\n", " 'part',\n", " 'ĠD',\n", " 'Ġof',\n", " 'Ġpart',\n", " 'ĠIV',\n", " 'Ġof',\n", " 'Ġsub',\n", " 'chapter',\n", " 'ĠA',\n", " 'Ġof',\n", " 'Ġchapter',\n", " 'Ġ1',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'ĠInternal',\n", " 'ĠRevenue',\n", " 'ĠCode',\n", " 'Ġof',\n", " 'Ġ1986',\n", " 'Ġ(',\n", " 'rel',\n", " 'ating',\n", " 'Ġto',\n", " 'Ġbusiness',\n", 
" 'Ġrelated',\n", " 'Ġcredits',\n", " ')',\n", " 'Ġis',\n", " 'Ġamended',\n", " 'Ġby',\n", " 'Ġadding',\n", " 'Ġat',\n", " 'Ġthe',\n", " 'Ġend',\n", " 'Ġthe',\n", " 'Ġfollowing',\n", " 'Ġnew',\n", " 'Ġsection',\n", " ':',\n", " 'Ġ``',\n", " 'SEC',\n", " '.',\n", " 'Ġ45',\n", " 'O',\n", " '.',\n", " 'ĠCONTR',\n", " 'IB',\n", " 'UT',\n", " 'IONS',\n", " 'ĠBEN',\n", " 'EF',\n", " 'IT',\n", " 'ING',\n", " 'ĠSC',\n", " 'IENCE',\n", " ',',\n", " 'ĠTECH',\n", " 'N',\n", " 'OLOGY',\n", " ',',\n", " 'ĠENG',\n", " 'INE',\n", " 'ER',\n", " 'ING',\n", " ',',\n", " 'ĠAND',\n", " 'ĠM',\n", " 'ATH',\n", " 'EM',\n", " 'AT',\n", " 'ICS',\n", " 'ĠED',\n", " 'UC',\n", " 'ATION',\n", " 'ĠAT',\n", " 'ĠTHE',\n", " 'ĠELE',\n", " 'MENT',\n", " 'ARY',\n", " 'ĠAND',\n", " 'ĠSEC',\n", " 'OND',\n", " 'ARY',\n", " 'ĠSCHOOL',\n", " 'ĠLEVEL',\n", " '.',\n", " 'Ġ``(',\n", " 'a',\n", " ')',\n", " 'ĠIn',\n", " 'ĠGeneral',\n", " '.--',\n", " 'For',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġsection',\n", " 'Ġ38',\n", " ',',\n", " 'Ġthe',\n", " 'Ġelementary',\n", " 'Ġand',\n", " 'Ġsecondary',\n", " 'Ġscience',\n", " ',',\n", " 'Ġtechnology',\n", " ',',\n", " 'Ġengineering',\n", " ',',\n", " 'Ġand',\n", " 'Ġmathematics',\n", " 'Ġ(',\n", " 'STEM',\n", " ')',\n", " 'Ġcontributions',\n", " 'Ġcredit',\n", " 'Ġdetermined',\n", " 'Ġunder',\n", " 'Ġthis',\n", " 'Ġsection',\n", " 'Ġfor',\n", " 'Ġthe',\n", " 'Ġtaxable',\n", " 'Ġyear',\n", " 'Ġis',\n", " 'Ġan',\n", " 'Ġamount',\n", " 'Ġequal',\n", " 'Ġto',\n", " 'Ġ100',\n", " 'Ġpercent',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġqualified',\n", " 'ĠSTEM',\n", " 'Ġcontributions',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġtaxpayer',\n", " 'Ġfor',\n", " 'Ġsuch',\n", " 'Ġtaxable',\n", " 'Ġyear',\n", " '.',\n", " 'Ġ``(',\n", " 'b',\n", " ')',\n", " 'ĠQual',\n", " 'ified',\n", " 'ĠSTEM',\n", " 'ĠContributions',\n", " '.--',\n", " 'For',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġthis',\n", " 'Ġsection',\n", " ',',\n", " 'Ġthe',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'qualified',\n", " 
'ĠSTEM',\n", " 'Ġcontributions',\n", " \"'\",\n", " 'Ġmeans',\n", " '--',\n", " 'Ġ``(',\n", " '1',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġschool',\n", " 'Ġcontributions',\n", " ',',\n", " 'Ġ``(',\n", " '2',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġteacher',\n", " 'Ġex',\n", " 'tern',\n", " 'ship',\n", " 'Ġexpenses',\n", " ',',\n", " 'Ġand',\n", " 'Ġ``(',\n", " '3',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġteacher',\n", " 'Ġtraining',\n", " 'Ġexpenses',\n", " '.',\n", " 'Ġ``(',\n", " 'c',\n", " ')',\n", " 'ĠSTEM',\n", " 'ĠSchool',\n", " 'ĠContributions',\n", " '.--',\n", " 'For',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġthis',\n", " 'Ġsection',\n", " '--',\n", " 'Ġ``(',\n", " '1',\n", " ')',\n", " 'ĠIn',\n", " 'Ġgeneral',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 'Ġschool',\n", " 'Ġcontributions',\n", " \"'\",\n", " 'Ġmeans',\n", " '--',\n", " 'Ġ``(',\n", " 'A',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġproperty',\n", " 'Ġcontributions',\n", " ',',\n", " 'Ġand',\n", " 'Ġ``(',\n", " 'B',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġservice',\n", " 'Ġcontributions',\n", " '.',\n", " 'Ġ``(',\n", " '2',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġproperty',\n", " 'Ġcontributions',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 'Ġproperty',\n", " 'Ġcontributions',\n", " \"'\",\n", " 'Ġmeans',\n", " 'Ġthe',\n", " 'Ġamount',\n", " 'Ġwhich',\n", " 'Ġwould',\n", " 'Ġ(',\n", " 'but',\n", " 'Ġfor',\n", " 'Ġsubsection',\n", " 'Ġ(',\n", " 'f',\n", " '))',\n", " 'Ġbe',\n", " 'Ġallowed',\n", " 'Ġas',\n", " 'Ġa',\n", " 'Ġdeduction',\n", " 'Ġunder',\n", " 'Ġsection',\n", " 'Ġ170',\n", " 'Ġfor',\n", " 'Ġa',\n", " 'Ġcharitable',\n", " 'Ġcontribution',\n", " 'Ġof',\n", " 'ĠSTEM',\n", " 'Ġinventory',\n", " 'Ġproperty',\n", " 'Ġif',\n", " '--',\n", " 'Ġ``(',\n", " 'A',\n", " ')',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " 'Ġis',\n", " 'Ġan',\n", " 'Ġelementary',\n", " 'Ġor',\n", " 'Ġsecondary',\n", " 'Ġschool',\n", " 'Ġdescribed',\n", " 'Ġin',\n", " 'Ġsection',\n", " 
'Ġ170',\n", " '(',\n", " 'b',\n", " ')(',\n", " '1',\n", " ')(',\n", " 'A',\n", " ')(',\n", " 'ii',\n", " '),',\n", " 'Ġ``(',\n", " 'B',\n", " ')',\n", " 'Ġsubstantially',\n", " 'Ġall',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġuse',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġproperty',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " 'Ġis',\n", " 'Ġwithin',\n", " 'Ġthe',\n", " 'ĠUnited',\n", " 'ĠStates',\n", " 'Ġor',\n", " 'Ġwithin',\n", " 'Ġthe',\n", " 'Ġdefense',\n", " 'Ġdepend',\n", " 'ents',\n", " \"'\",\n", " 'Ġeducation',\n", " 'Ġsystem',\n", " 'Ġfor',\n", " 'Ġeducational',\n", " 'Ġpurposes',\n", " 'Ġin',\n", " 'Ġany',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġgrades',\n", " 'ĠK',\n", " '-',\n", " '12',\n", " 'Ġthat',\n", " 'Ġare',\n", " 'Ġrelated',\n", " 'Ġto',\n", " 'Ġthe',\n", " 'Ġpurpose',\n", " 'Ġor',\n", " 'Ġfunction',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " ',',\n", " 'Ġ``(',\n", " 'C',\n", " ')',\n", " 'Ġthe',\n", " 'Ġoriginal',\n", " 'Ġuse',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġproperty',\n", " 'Ġbegins',\n", " 'Ġwith',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " ',',\n", " 'Ġ``(',\n", " 'D',\n", " ')',\n", " 'Ġthe',\n", " 'Ġproperty',\n", " 'Ġwill',\n", " 'Ġfit',\n", " 'Ġproduct',\n", " 'ively',\n", " 'Ġinto',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " \"'s\",\n", " 'Ġeducation',\n", " 'Ġplan',\n", " ',',\n", " 'Ġ``(',\n", " 'E',\n", " ')',\n", " 'Ġthe',\n", " 'Ġproperty',\n", " 'Ġis',\n", " 'Ġnot',\n", " 'Ġtransferred',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " 'Ġin',\n", " 'Ġexchange',\n", " 'Ġfor',\n", " 'Ġmoney',\n", " ',',\n", " 'Ġother',\n", " 'Ġproperty',\n", " ',',\n", " 'Ġor',\n", " 'Ġservices',\n", " ',',\n", " 'Ġexcept',\n", " 'Ġfor',\n", " 'Ġshipping',\n", " ',',\n", " 'Ġinstallation',\n", " 'Ġand',\n", " 'Ġtransfer',\n", " 'Ġcosts',\n", " ',',\n", " 'Ġand',\n", " 'Ġ``(',\n", " 'F',\n", " ')',\n", " 'Ġthe',\n", " 'Ġdone',\n", " 'e',\n", " \"'s\",\n", " 'Ġuse',\n", " 'Ġand',\n", " 'Ġdisposition',\n", " 
'Ġof',\n", " 'Ġthe',\n", " 'Ġproperty',\n", " 'Ġwill',\n", " 'Ġbe',\n", " 'Ġin',\n", " 'Ġaccordance',\n", " 'Ġwith',\n", " 'Ġthe',\n", " 'Ġprovisions',\n", " 'Ġof',\n", " 'Ġsubparagraph',\n", " 's',\n", " 'Ġ(',\n", " 'B',\n", " ')',\n", " 'Ġand',\n", " 'Ġ(',\n", " 'E',\n", " ').',\n", " 'ĠThe',\n", " 'Ġdetermination',\n", " 'Ġof',\n", " 'Ġthe',\n", " 'Ġamount',\n", " 'Ġof',\n", " 'Ġdeduction',\n", " 'Ġunder',\n", " 'Ġsection',\n", " 'Ġ170',\n", " 'Ġfor',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġthis',\n", " 'Ġparagraph',\n", " 'Ġshall',\n", " 'Ġbe',\n", " 'Ġmade',\n", " 'Ġas',\n", " 'Ġif',\n", " 'Ġthe',\n", " 'Ġlimitation',\n", " 'Ġunder',\n", " 'Ġsection',\n", " 'Ġ170',\n", " '(',\n", " 'e',\n", " ')(',\n", " '3',\n", " ')(',\n", " 'B',\n", " ')',\n", " 'Ġapplied',\n", " 'Ġto',\n", " 'Ġall',\n", " 'ĠSTEM',\n", " 'Ġinventory',\n", " 'Ġproperty',\n", " '.',\n", " 'Ġ``(',\n", " '3',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġservice',\n", " 'Ġcontributions',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 'Ġservice',\n", " 'Ġcontributions',\n", " \"'\",\n", " 'Ġmeans',\n", " 'Ġthe',\n", " 'Ġamount',\n", " 'Ġpaid',\n", " 'Ġor',\n", " 'Ġincurred',\n", " 'Ġduring',\n", " 'Ġthe',\n", " 'Ġtaxable',\n", " 'Ġyear',\n", " 'Ġfor',\n", " 'ĠSTEM',\n", " 'Ġservices',\n", " 'Ġprovided',\n", " 'Ġin',\n", " 'Ġthe',\n", " 'ĠUnited',\n", " 'ĠStates',\n", " 'Ġor',\n", " 'Ġin',\n", " 'Ġthe',\n", " 'Ġdefense',\n", " 'Ġdepend',\n", " 'ents',\n", " \"'\",\n", " 'Ġeducation',\n", " 'Ġsystem',\n", " 'Ġfor',\n", " 'Ġthe',\n", " 'Ġexclusive',\n", " 'Ġbenefit',\n", " 'Ġof',\n", " 'Ġstudents',\n", " 'Ġat',\n", " 'Ġan',\n", " 'Ġelementary',\n", " 'Ġor',\n", " 'Ġsecondary',\n", " 'Ġschool',\n", " 'Ġdescribed',\n", " 'Ġin',\n", " 'Ġsection',\n", " 'Ġ170',\n", " '(',\n", " 'b',\n", " ')(',\n", " '1',\n", " ')(',\n", " 'A',\n", " ')(',\n", " 'ii',\n", " ')',\n", " 'Ġbut',\n", " 'Ġonly',\n", " 'Ġif',\n", " '--',\n", " 'Ġ``(',\n", " 'A',\n", " ')',\n", " 'Ġthe',\n", " 'Ġtaxpayer',\n", " 
'Ġis',\n", " 'Ġengaged',\n", " 'Ġin',\n", " 'Ġthe',\n", " 'Ġtrade',\n", " 'Ġor',\n", " 'Ġbusiness',\n", " 'Ġof',\n", " 'Ġproviding',\n", " 'Ġsuch',\n", " 'Ġservices',\n", " 'Ġon',\n", " 'Ġa',\n", " 'Ġcommercial',\n", " 'Ġbasis',\n", " ',',\n", " 'Ġand',\n", " 'Ġ``(',\n", " 'B',\n", " ')',\n", " 'Ġno',\n", " 'Ġcharge',\n", " 'Ġis',\n", " 'Ġimposed',\n", " 'Ġfor',\n", " 'Ġproviding',\n", " 'Ġsuch',\n", " 'Ġservices',\n", " '.',\n", " 'Ġ``(',\n", " '4',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġinventory',\n", " 'Ġproperty',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 'Ġinventory',\n", " 'Ġproperty',\n", " \"'\",\n", " 'Ġmeans',\n", " ',',\n", " 'Ġwith',\n", " 'Ġrespect',\n", " 'Ġto',\n", " 'Ġany',\n", " 'Ġcontribution',\n", " 'Ġto',\n", " 'Ġa',\n", " 'Ġschool',\n", " ',',\n", " 'Ġany',\n", " 'Ġproperty',\n", " '--',\n", " 'Ġ``(',\n", " 'A',\n", " ')',\n", " 'Ġwhich',\n", " 'Ġis',\n", " 'Ġdescribed',\n", " 'Ġin',\n", " 'Ġparagraph',\n", " 'Ġ(',\n", " '1',\n", " ')',\n", " 'Ġor',\n", " 'Ġ(',\n", " '2',\n", " ')',\n", " 'Ġof',\n", " 'Ġsection',\n", " 'Ġ12',\n", " '21',\n", " '(',\n", " 'a',\n", " ')',\n", " 'Ġwith',\n", " 'Ġrespect',\n", " 'Ġto',\n", " 'Ġthe',\n", " 'Ġdonor',\n", " ',',\n", " 'Ġand',\n", " 'Ġ``(',\n", " 'B',\n", " ')',\n", " 'Ġwhich',\n", " 'Ġis',\n", " 'Ġdetermined',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġschool',\n", " 'Ġto',\n", " 'Ġbe',\n", " 'Ġneeded',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġschool',\n", " 'Ġin',\n", " 'Ġproviding',\n", " 'Ġeducation',\n", " 'Ġin',\n", " 'Ġgrades',\n", " 'ĠK',\n", " '-',\n", " '12',\n", " 'Ġin',\n", " 'Ġthe',\n", " 'Ġareas',\n", " 'Ġof',\n", " 'Ġscience',\n", " ',',\n", " 'Ġtechnology',\n", " ',',\n", " 'Ġengineering',\n", " ',',\n", " 'Ġor',\n", " 'Ġmathematics',\n", " '.',\n", " 'Ġ``(',\n", " '5',\n", " ')',\n", " 'ĠSTEM',\n", " 'Ġservices',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 'Ġservices',\n", " \"'\",\n", " 'Ġmeans',\n", " ',',\n", " 'Ġwith',\n", " 'Ġrespect',\n", 
" 'Ġto',\n", " 'Ġany',\n", " 'Ġcontribution',\n", " 'Ġto',\n", " 'Ġa',\n", " 'Ġschool',\n", " ',',\n", " 'Ġany',\n", " 'Ġservice',\n", " 'Ġdetermined',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġschool',\n", " 'Ġto',\n", " 'Ġbe',\n", " 'Ġneeded',\n", " 'Ġby',\n", " 'Ġthe',\n", " 'Ġschool',\n", " 'Ġin',\n", " 'Ġproviding',\n", " 'Ġeducation',\n", " 'Ġin',\n", " 'Ġgrades',\n", " 'ĠK',\n", " '-',\n", " '12',\n", " 'Ġin',\n", " 'Ġthe',\n", " 'Ġareas',\n", " 'Ġof',\n", " 'Ġscience',\n", " ',',\n", " 'Ġtechnology',\n", " ',',\n", " 'Ġengineering',\n", " ',',\n", " 'Ġor',\n", " 'Ġmathematics',\n", " ',',\n", " 'Ġincluding',\n", " 'Ġteaching',\n", " 'Ġcourses',\n", " 'Ġof',\n", " 'Ġinstruction',\n", " 'Ġat',\n", " 'Ġsuch',\n", " 'Ġschool',\n", " 'Ġin',\n", " 'Ġany',\n", " 'Ġsuch',\n", " 'Ġarea',\n", " '.',\n", " 'Ġ``(',\n", " '6',\n", " ')',\n", " 'ĠDefense',\n", " 'Ġdepend',\n", " 'ents',\n", " \"'\",\n", " 'Ġeducation',\n", " 'Ġsystem',\n", " '.--',\n", " 'For',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġthis',\n", " 'Ġsubsection',\n", " ',',\n", " 'Ġthe',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'defense',\n", " 'Ġdepend',\n", " 'ents',\n", " \"'\",\n", " 'Ġeducation',\n", " 'Ġsystem',\n", " \"'\",\n", " 'Ġmeans',\n", " 'Ġthe',\n", " 'Ġprogram',\n", " 'Ġestablished',\n", " 'Ġand',\n", " 'Ġoperated',\n", " 'Ġunder',\n", " 'Ġthe',\n", " 'ĠDefense',\n", " 'ĠDepend',\n", " 'ents',\n", " \"'\",\n", " 'ĠEducation',\n", " 'ĠAct',\n", " 'Ġof',\n", " 'Ġ1978',\n", " 'Ġ(',\n", " '20',\n", " 'ĠU',\n", " '.',\n", " 'S',\n", " '.',\n", " 'C',\n", " '.',\n", " 'Ġ9',\n", " '21',\n", " 'Ġet',\n", " 'Ġseq',\n", " '.).',\n", " 'Ġ``(',\n", " 'd',\n", " ')',\n", " 'ĠSTEM',\n", " 'ĠTeacher',\n", " 'ĠEx',\n", " 'tern',\n", " 'ship',\n", " 'ĠExp',\n", " 'enses',\n", " '.--',\n", " 'For',\n", " 'Ġpurposes',\n", " 'Ġof',\n", " 'Ġthis',\n", " 'Ġsection',\n", " '--',\n", " 'Ġ``(',\n", " '1',\n", " ')',\n", " 'ĠIn',\n", " 'Ġgeneral',\n", " '.--',\n", " 'The',\n", " 'Ġterm',\n", " 'Ġ`',\n", " 'STEM',\n", " 
# %% Inspect how the tokenizer splits one bill
# `tokenizer` is created in an earlier cell (presumably the GPT2TokenizerFast
# imported at the top of the notebook -- TODO confirm). Tokenizing the bill whose
# index label is 0 shows the sub-word pieces the model actually sees.
understand_tokenization = tokenizer.tokenize(df_bills["text"][0])
understand_tokenization

# %% Token count for that bill
# The recorded output (1480) matches the n_tokens value shown for row 0.
len(understand_tokenization)
# %% Embed every bill with the document-side search model
# `df_bills` was produced by slicing another frame, so writing a column into it
# in place (`df_bills['curie_search'] = ...`) raises pandas'
# SettingWithCopyWarning. `assign` returns a brand-new frame with the extra
# column, and rebinding the name keeps every later cell working unchanged.
# NOTE: this issues one embedding request per row, so it is the slow/expensive cell.
df_bills = df_bills.assign(
    curie_search=df_bills["text"].apply(
        lambda bill_text: get_embedding(bill_text, engine="text-search-curie-doc-001")
    )
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsummarytitlen_tokenscurie_search
0SECTION 1. SHORT TITLE. This Act may be cited ...National Science Education Tax Incentive for B...To amend the Internal Revenue Code of 1986 to ...1480[-0.019770914688706398, 0.011169900186359882, ...
1SECTION 1. SHORT TITLE. This Act may be cited ...Small Business Expansion and Hiring Act of 201...To amend the Internal Revenue Code of 1986 to ...1152[-0.007850012741982937, 0.01001765951514244, 0...
2SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR...Requires the Director of National Intelligence...A bill to require the Director of National Int...930[0.00012103027984267101, 0.011845593340694904,...
4SECTION 1. SHORT TITLE. This Act may be cited ...Military Call-up Relief Act - Amends the Inter...A bill to amend the Internal Revenue Code of 1...1048[-0.005481021944433451, 0.00856819562613964, -...
5SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR...Requires the Customs Service to reliquidate ce...To provide for reliquidation of entries premat...1846[-0.008310390636324883, -0.004660653416067362,...
6SECTION 1. SHORT TITLE. This Act may be cited ...Service Dogs for Veterans Act of 2009 - Direct...A bill to require the Secretary of Veterans Af...872[-0.017687108367681503, 0.011164870113134384, ...
9SECTION 1. SHORT TITLE. This Act may be cited ...Taxpayer's Right to View Act of 1993 - Amends ...Taxpayer's Right to View Act of 1993946[0.0021867561154067516, -0.004219848196953535,...
12SECTION 1. FINDINGS. The Congress finds the fo...Amends the Marine Mammal Protection Act of 197...To amend the Marine Mammal Protection Act of 1...1223[-0.015813011676073074, 0.009919906966388226, ...
14SECTION 1. SHORT TITLE. This Act may be cited ...Education and Training for Health Act of 2017 ...Education and Training for Health Act of 20171596[-0.0150684155523777, 0.005073960404843092, 0....
16SECTION 1. SHORT TITLE. This Act may be cited ...Andrew Prior Act or Andrew's Law - Amends the ...Andrew's Law608[-0.011593054980039597, 0.022752899676561356, ...
17SECTION 1. SHORT TITLE. This Act may be cited ...Directs the President, in coordination with de...Energy Independence Act of 20001341[-0.008348068222403526, 0.00272438395768404, 0...
18SECTION 1. SHORT TITLE. This Act may be cited ...This measure has not been amended since it was...Veterans Entrepreneurship Act of 20151404[-0.020315825939178467, 0.0011716989101842046,...
\n", "
" ], "text/plain": [ " text \\\n", "0 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "1 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "2 SECTION 1. RELEASE OF DOCUMENTS CAPTURED IN IR... \n", "4 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "5 SECTION 1. RELIQUIDATION OF CERTAIN ENTRIES PR... \n", "6 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "9 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "12 SECTION 1. FINDINGS. The Congress finds the fo... \n", "14 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "16 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "17 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "18 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "\n", " summary \\\n", "0 National Science Education Tax Incentive for B... \n", "1 Small Business Expansion and Hiring Act of 201... \n", "2 Requires the Director of National Intelligence... \n", "4 Military Call-up Relief Act - Amends the Inter... \n", "5 Requires the Customs Service to reliquidate ce... \n", "6 Service Dogs for Veterans Act of 2009 - Direct... \n", "9 Taxpayer's Right to View Act of 1993 - Amends ... \n", "12 Amends the Marine Mammal Protection Act of 197... \n", "14 Education and Training for Health Act of 2017 ... \n", "16 Andrew Prior Act or Andrew's Law - Amends the ... \n", "17 Directs the President, in coordination with de... \n", "18 This measure has not been amended since it was... \n", "\n", " title n_tokens \\\n", "0 To amend the Internal Revenue Code of 1986 to ... 1480 \n", "1 To amend the Internal Revenue Code of 1986 to ... 1152 \n", "2 A bill to require the Director of National Int... 930 \n", "4 A bill to amend the Internal Revenue Code of 1... 1048 \n", "5 To provide for reliquidation of entries premat... 1846 \n", "6 A bill to require the Secretary of Veterans Af... 872 \n", "9 Taxpayer's Right to View Act of 1993 946 \n", "12 To amend the Marine Mammal Protection Act of 1... 
# %% Show the bills frame to confirm the new `curie_search` embedding column is present
df_bills
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
textsummarytitlen_tokenscurie_searchsimilarities
9SECTION 1. SHORT TITLE. This Act may be cited ...Taxpayer's Right to View Act of 1993 - Amends ...Taxpayer's Right to View Act of 1993946[0.0021867561154067516, -0.004219848196953535,...0.363270
0SECTION 1. SHORT TITLE. This Act may be cited ...National Science Education Tax Incentive for B...To amend the Internal Revenue Code of 1986 to ...1480[-0.019770914688706398, 0.011169900186359882, ...0.314105
1SECTION 1. SHORT TITLE. This Act may be cited ...Small Business Expansion and Hiring Act of 201...To amend the Internal Revenue Code of 1986 to ...1152[-0.007850012741982937, 0.01001765951514244, 0...0.297908
18SECTION 1. SHORT TITLE. This Act may be cited ...This measure has not been amended since it was...Veterans Entrepreneurship Act of 20151404[-0.020315825939178467, 0.0011716989101842046,...0.295586
\n", "
" ], "text/plain": [ " text \\\n", "9 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "0 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "1 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "18 SECTION 1. SHORT TITLE. This Act may be cited ... \n", "\n", " summary \\\n", "9 Taxpayer's Right to View Act of 1993 - Amends ... \n", "0 National Science Education Tax Incentive for B... \n", "1 Small Business Expansion and Hiring Act of 201... \n", "18 This measure has not been amended since it was... \n", "\n", " title n_tokens \\\n", "9 Taxpayer's Right to View Act of 1993 946 \n", "0 To amend the Internal Revenue Code of 1986 to ... 1480 \n", "1 To amend the Internal Revenue Code of 1986 to ... 1152 \n", "18 Veterans Entrepreneurship Act of 2015 1404 \n", "\n", " curie_search similarities \n", "9 [0.0021867561154067516, -0.004219848196953535,... 0.363270 \n", "0 [-0.019770914688706398, 0.011169900186359882, ... 0.314105 \n", "1 [-0.007850012741982937, 0.01001765951514244, 0... 0.297908 \n", "18 [-0.020315825939178467, 0.0011716989101842046,... 
# %% Semantic search over the embedded bills
def search_docs(df, user_query, top_n=3, to_print=True,
                engine="text-search-curie-query-001"):
    """Rank the documents in ``df`` by similarity to a free-text query.

    The query is embedded with ``engine`` and compared, via cosine similarity,
    against the per-document embeddings stored in ``df["curie_search"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``curie_search`` column holding one embedding per row.
        The frame is NOT modified; scoring happens on a copy.
    user_query : str
        Natural-language search text.
    top_n : int, default 3
        Number of best-matching rows to return.
    to_print : bool, default True
        When true, render the result with IPython's ``display``.
    engine : str, default "text-search-curie-query-001"
        Query-side embedding deployment. It should be the counterpart of the
        model that produced ``curie_search`` (here the ``-doc-001`` model).

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` rows of ``df`` plus a ``similarities`` column, sorted
        from most to least similar.
    """
    query_embedding = get_embedding(user_query, engine=engine)

    # `assign` builds a new frame instead of writing into the caller's object;
    # the original in-place write mutated `df` as a hidden side effect and
    # triggered SettingWithCopyWarning when `df` was itself a slice.
    scored = df.assign(
        similarities=df["curie_search"].apply(
            lambda doc_embedding: cosine_similarity(doc_embedding, query_embedding)
        )
    )

    res = scored.sort_values("similarities", ascending=False).head(top_n)
    if to_print:
        display(res)
    return res


res = search_docs(df_bills, "can i get information on cable company tax revenue", top_n=4)
# %% Summary of the best-matching bill
# `res` is sorted by similarity (highest first), so position 0 is the top hit.
# Selecting by position instead of the hard-coded index label 9 keeps this cell
# working when a different query or dataset puts another bill on top
# (label lookup would raise KeyError if bill 9 were not in the result).
res["summary"].iloc[0]