{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import re\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'\n", "t1 = pd.read_csv(fn, header=0)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from tidylib import tidy_document\n", "import html2text" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def cleanup(txt):\n", " doc, errors = tidy_document(str(txt))\n", " doc = html2text.html2text(doc)\n", " return doc.replace('*','').replace('_','').lower().strip()\n", " \n", "def add_clean_text(row):\n", " row['scrape'] = cleanup(row.AdditionalDescription)\n", " return row" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "t1_fix = t1.apply(add_clean_text,1)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Mayor's Office of Contract Services 67\n", "Community Boards 36\n", "Citywide Administrative Services 35\n", "Housing Preservation and Development 18\n", "Landmarks Preservation Commission 18\n", "Human Resources Administration 17\n", "Transportation 16\n", "Health and Mental Hygiene 15\n", "Administration for Children's Services 14\n", "City Planning 13\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# I only care about these columns for now\n", "cols = ['RequestID', 'ConfirmationNumber', 'AgencyCode', 'AgencyName',\n", " 'AgencyDivision', 'SectionID', 'SectionName', 'scrape']\n", "\n", "fixed = t1_fix[cols]\n", "\n", "# top 10: breakdown of ads by agencies\n", "fixed['AgencyName'].value_counts()[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Focus on parsing the Mayor's Office of Contract of Services ads.**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# select Office of Contract Services Ads\n", "mocs = fixed['AgencyName'] == \"Mayor's Office of Contract Services\"\n", "mocs_ads = fixed[mocs][['RequestID', 'scrape']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Example entries**\n", "\n", " __public meeting notice__\n", " \n", " public notice is hereby given that the \n", " franchise and concession review committee will hold a \n", " public meeting on wednesday, october 8, 2014 at 2:30 p.m., \n", " at 22 reade street, spector hall, borough of manhattan. \n", " \n", "\n", "Fields to extract | description | Parsing Status\n", "------------------| -----------------| --------------\n", "datetime | meeting datetime | working\n", "\n", "__Notice of intent to extend contract__\n", "\n", " vendor: accenture llp\n", " description of services: design, development and deployment of application\n", " enhancements and extensions to the existing apt system along with the\n", " appropriate documentation required.\n", "\n", " award method of original contract: intergovernmental\n", " fms contract type: consultant\n", " end date of original contract: 1/31/2015 \n", " method of renewal/extenction the agency intends to utilize: extension\n", " new start date of the proposed renewed/extended contract: 2/1/15\n", " new end date of the proposed renewed/extended contract: 7/31/15\n", " modifications sought to the nature of services performed under the contract: none\n", " reason(s) the agency intends to renew/extend the contract: continuation of services\n", " personnel in substantially similar titles within agency: apt project manager –\n", " 1; apt technical lead – 1; apt developer - 2\n", " headcount of personnel in substantially similar titles within agency: 4\n", "\n", "\n", "Fields to extract |desc | Parsing Status\n", "----------------------------------------------------------------------------|-----|---------------\n", "vendor | - | needs test \n", "description of services | - | needs test \n", "award method of origian contract | - | needs test \n", "fms contract type | - | needs test \n", "end date of original contract | - | needs test \n", "method of renewalextension | - | needs test \n", "new start date of proposed renewed/extended contract | - | needs test \n", "new end date of proposed renewed/extended contract | - | needs test \n", "modifications sought to the nature of services performed under the contract | - | needs test \n", "reason(s) the agency intends to renew/extend the contract | - | needs test \n", "personnel in substantially similar titles within agency | - | debugging & needs test \n", "headcount of personnel in substantially similar titles within agency | - | needs test \n", "\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# build a regex for public meeting notice\n", "rex_time = 'on\\s+(?P[^\\s,]+)[\\s,]\\s+(?P[^\\s,]+)\\s+(?P\\d+)[,\\s]+(?P\\d+)\\s+(at)?\\s*(?P\\d+):(?P\\d+)\\s+(?P\\w+\\.?\\w+\\.?)'\n", "rex_time = re.compile(rex_time, re.IGNORECASE|re.DOTALL|re.MULTILINE)\n", "\n", "# build regexes to parse Notice of intent to extend contract ads\n", "extend_contract_keys = [\n", " 'vendor',\n", " 'description of services', \n", " 'award method of origian contract',\n", " 'fms contract type',\n", " 'end date of original contract',\n", " 'method of renewalextension',\n", " 'new start date of proposed renewed/extended contract',\n", " 'new end date of proposed renewed/extended contract',\n", " 'modifications sought to the nature of services performed under the contract',\n", " 'reason(s) the agency intends to renew/extend the contract',\n", " 'personnel in substantially similar titles within agency',\n", " 'headcount of personnel in substantially similar titles within agency'\n", "]\n", "\n", "rex_mm_dd_year = '(?P\\d{1,2})/(?P\\d{1,2})/(?P\\d{2,4})'\n", "rex_extend_contract = {\n", " 'vendor' : 'vendor:\\s+(?P[^\\n]+$)\\n',\n", " 'description of services' : 'description\\sof\\sservices:\\s+(?P.+)\\n+award',\n", " 'award method of origian contract': 'award\\smethod\\sof\\soriginal\\scontract:\\s+(?P[^\\n]+)',\n", " 'fms contract type' : 'fms\\scontract\\stype:\\s+(?P[^\\n]+$)\\n',\n", " 'end date of original contract': 'end\\sdate\\sof\\soriginal\\scontract:\\s' + rex_mm_dd_year,\n", " 'method of renewalextension': 'method\\sof\\srenewal/extension[^:]+:\\s(?P[^\\n]+$)\\n',\n", " 'new start date of proposed renewed/extended contract': 'new\\sstart\\sdate\\sof\\sthe\\sproposed\\srenewed.extended\\scontract:\\s' + rex_mm_dd_year,\n", " 'new end date of proposed renewed/extended contract': 'new\\send\\sdate\\sof\\sthe\\sproposed\\srenewed.extended\\scontract:\\s' + rex_mm_dd_year,\n", " 'modifications sought to the nature of services performed under the contract': 'modifications\\ssought\\sto\\sthe\\snature\\sof\\sservices\\sperformed\\sunder\\sthe\\scontract:\\s+(?P.+)\\n+reason',\n", " 'reason(s) the agency intends to renew/extend the contract': 'reason\\(s\\)\\sthe\\sagency\\sintends\\sto\\srenew/extend\\sthe\\scontract:\\s(?P.+)\\n+personnel',\n", " \n", " # todo - debug this regex. not working on all the dataset\n", " # \n", " 'personnel in substantially similar titles within agency': '^personnel\\sin\\ssubstantially\\ssimilar\\stitles\\swithin\\sagency:\\s+(?P(none|.+))\\n(headcount)?',\n", " 'headcount of personnel in substantially similar titles within agency': 'headcount\\sof\\spersonnel\\sin\\ssubstantially\\ssimilar\\stitles\\swithin\\sagency:\\s+(?P\\d+)\\n'\n", "}\n", "for k,v in rex_extend_contract.items():\n", " rex_extend_contract[k] = re.compile(v, re.IGNORECASE|re.MULTILINE|re.DOTALL)\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "mocs = fixed['AgencyName'] == \"Mayor's Office of Contract Services\"\n", "mocs_ads = fixed[mocs][['RequestID', 'scrape']]\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Extracted: [('wednesday', 'october', '8', '2014', 'at', '2', '30', 'p.m.')]\n", "--------\n", "\n", "public notice is hereby given that the \n", "\n", " franchise and concession review committee will hold a \n", "\n", " public meeting on wednesday\n", "--------\n", "\n", "Extracted: [('wednesday', 'december', '10', '2014', 'at', '2', '30', 'p.m.')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('apt project manager –\\n1; apt technical lead – 1; apt developer - 2\\n', 'apt project manager –\\n1; apt technical lead – 1; apt developer - 2\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('architect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of brooklyn and\\nstaten island\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of the bronx and\\nmanhattan\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof corlears hook comfort station located in the borough of manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof nine comfort stations located in the boroughs of the bronx and manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof eight comfort stations located in the boroughs of brooklyn and staten\\nisland\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors', 'architect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of brooklyn and\\nstaten island\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for the\\nreconstruction of de-commissioned buildings in the boroughs of the bronx and\\nmanhattan\\nstart date of the proposed contract: 12/1/2014\\nend date of the proposed contract: 1/1/2015\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof corlears hook comfort station located in the borough of manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof nine comfort stations located in the boroughs of the bronx and manhattan\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors\\nheadcount of personnel in substantially similar titles within agency: 85\\nagency: department of parks and recreation\\nnature of services sought: architectural design services for reconstruction\\nof eight comfort stations located in the boroughs of brooklyn and staten\\nisland\\nstart date of the proposed contract: 1/3/2015\\nend date of the proposed contract: 4/3/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineers, surveyors', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none \\n\\nheadcount of personnel in substantially similar titles within agency: none\\nagency: omb\\n\\nvendor: oac services inc\\n\\nnature of services: provide value engineering services on a task order basis\\n\\nmethod of renewal/extension the agency intends to utilize: renewal\\n\\nnew start date of the proposed renewed/extended contract: 3/1/2015\\n\\nnew end date of the proposed renewed/extended contract: 2/29/2016\\n\\nmodifications sought to the nature of services performed under the contract:\\nnone\\n\\nreason(s) the agency intends to renew/extend the contract: continued need for\\nservices\\n\\npersonnel in substantially similar titles within agency: none \\n', 'none \\n\\nheadcount of personnel in substantially similar titles within agency: none\\nagency: omb\\n\\nvendor: oac services inc\\n\\nnature of services: provide value engineering services on a task order basis\\n\\nmethod of renewal/extension the agency intends to utilize: renewal\\n\\nnew start date of the proposed renewed/extended contract: 3/1/2015\\n\\nnew end date of the proposed renewed/extended contract: 2/29/2016\\n\\nmodifications sought to the nature of services performed under the contract:\\nnone\\n\\nreason(s) the agency intends to renew/extend the contract: continued need for\\nservices\\n\\npersonnel in substantially similar titles within agency: none \\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none \\n', 'none \\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('civil engineer,\\nhighway & sewer inspector\\n', 'civil engineer,\\nhighway & sewer inspector\\n', 'headcount')]\n", "--------\n", "\n", "please see attached document.\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('accountant,\\nmanagement auditor, staff analyst, associate staff analyst, admin staff\\nanalyst nm\\n', 'accountant,\\nmanagement auditor, staff analyst, associate staff analyst, admin staff\\nanalyst nm\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('civil engineer,\\nhighway & sewer inspector, landscape architect\\n', 'civil engineer,\\nhighway & sewer inspector, landscape architect\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('project manager,\\nassociate project manager, construction project manager, construction project\\nmanager intern, civil engineers, assistant civil engineers\\n', 'project manager,\\nassociate project manager, construction project manager, construction project\\nmanager intern, civil engineers, assistant civil engineers\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('project manager,\\nsenior project manager, senior manager, application support manager,\\napplication support lead, systems administrator, development manager,\\ntechnical lead, java developer, analytics architect manager, functional test\\nlead, functional tester.\\n', 'project manager,\\nsenior project manager, senior manager, application support manager,\\napplication support lead, systems administrator, development manager,\\ntechnical lead, java developer, analytics architect manager, functional test\\nlead, functional tester.\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('project manager,\\nassociate project manager, construction project manager, construction project\\nmanager intern\\n', 'project manager,\\nassociate project manager, construction project manager, construction project\\nmanager intern\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('landscape architect,\\nassistant\\n\\nlandscape architect, landscape architect intern, project manager, associate\\nproject manager', 'landscape architect,\\nassistant\\n\\nlandscape architect, landscape architect intern, project manager, associate\\nproject manager', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none \\n', 'none \\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "nan\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('civil engineers, assistant civil engineer, project manager, associated project\\nmanager, architect, architect intern, assistant architect, landmark\\npreservationist\\n\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation \\n\\nnature of service sought: architectural design services for bid documents for\\nthree (3) sets of modular units at midland beach, staten island and two (2)\\nsets of modular units at cedar grove beach, staten island\\n\\nstart date: 3/1/2015\\n\\nend date: 3/1/2016\\n\\nmethod of solicitation the agency intends to utilize: task order\\n\\npersonnel in substantially similar titles within agency:\\n\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineer\\n', 'civil engineers, assistant civil engineer, project manager, associated project\\nmanager, architect, architect intern, assistant architect, landmark\\npreservationist\\n\\nheadcount of personnel in substantially similar titles within agency: 78\\nagency: department of parks and recreation \\n\\nnature of service sought: architectural design services for bid documents for\\nthree (3) sets of modular units at midland beach, staten island and two (2)\\nsets of modular units at cedar grove beach, staten island\\n\\nstart date: 3/1/2015\\n\\nend date: 3/1/2016\\n\\nmethod of solicitation the agency intends to utilize: task order\\n\\npersonnel in substantially similar titles within agency:\\n\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, civil engineers, assistant civil\\nengineer\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('civil engineer, assistant civil engineer, project manager, associate project\\nmanager, landscape architect, assistant landscape architect, landscape\\narchitect intern\\n', 'civil engineer, assistant civil engineer, project manager, associate project\\nmanager, landscape architect, assistant landscape architect, landscape\\narchitect intern\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [(\"landscape architect, assistant landscape architect, landscape architect\\nintern. project manager, associate project manager, civil engineer, assistant\\ncivil engineer\\nheadcount of personnel in substantially similar titles within agency: 146\\nagency: department of parks and recreation\\n\\n \\n\\nnature of services sought: architectural design services to undertake a\\ncomprehensive plan for staten island's east shore parks\\nstart date: 6/1/2015\\nend date: 3/1/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, landscape architect, assistant\\nlandscape architect, landscape architect intern, civil engineer, assistant\\ncivil engineer\\nheadcount of personnel in substantially similar titles within agency: 161\\nagency: department of parks and recreation\\n\\n \\n\\nnature of services sought: engineering design services to undertake a\\ncomprehensive plan for staten island's east shore parks\\nstart date: 6/1/2015\\nend date: 3/1/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\ncivil engineer, assistant civil engineer, project manager, associate project\\nmanager, landscape architect, assistant landscape architect, landscape\\narchitect intern\", \"landscape architect, assistant landscape architect, landscape architect\\nintern. project manager, associate project manager, civil engineer, assistant\\ncivil engineer\\nheadcount of personnel in substantially similar titles within agency: 146\\nagency: department of parks and recreation\\n\\n \\n\\nnature of services sought: architectural design services to undertake a\\ncomprehensive plan for staten island's east shore parks\\nstart date: 6/1/2015\\nend date: 3/1/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\narchitect, architect intern, assistant architect, landmarks preservationist,\\nproject manager, associate project manager, landscape architect, assistant\\nlandscape architect, landscape architect intern, civil engineer, assistant\\ncivil engineer\\nheadcount of personnel in substantially similar titles within agency: 161\\nagency: department of parks and recreation\\n\\n \\n\\nnature of services sought: engineering design services to undertake a\\ncomprehensive plan for staten island's east shore parks\\nstart date: 6/1/2015\\nend date: 3/1/2016\\nmethod of solicitation the agency intends to utilize: task order\\npersonnel in substantially similar titles within agency:\\ncivil engineer, assistant civil engineer, project manager, associate project\\nmanager, landscape architect, assistant landscape architect, landscape\\narchitect intern\", 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "nan\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('senior network\\narchitect\\n\\nheadcount of personnel in substantially similar titles within agency: 2\\nagency: department of information technology and telecommunications\\n\\ndescription of services sought: establish a logging and recording (l&r)\\nsystem for both nypd and fdny at psac2, including the design, implementation,\\ninstallation and testing.\\n\\nstart date of the proposed contract: 4/1/2015\\n\\nend date of the proposed contract: 3/31/2018\\n\\nmethod of solicitation the agency intends to utilize: negotiated acquisition\\n\\npersonnel in substantially similar titles within agency: infrastructure\\nproject manager, infrastructure architect, infrastructure engineer\\n\\nheadcount of personnel in substantially similar titles within agency: 22\\nnotice of intent to extend contract(s) not included in fy 2015 annual\\ncontracting plan and schedule\\nnotice is hereby given that the mayor will be entering into the following extension(s) of (a) contract(s) not included in the fy 2015 annual contracting plan and schedule that is published pursuant to new york city charter § 312(a): \\nagency: department of information technology & telecommunications\\n\\nfms contract #: cta1-858- 20147204172\\n\\nvendor: comsys it services llc\\n\\ndescription of services:\\n\\nprovide subject matter expertise in the development of the application\\nrequirements along with the design, build and deployment of the application in\\nall environments.\\n\\ndevelop the approach for the future business requirements, plan, design, build\\nand deploy future enhancements and extensions to the existing system.\\n\\ndesign and develop the documentum platform as an enterprise wide content\\nmanagement system that can be extended to various city agencies on a shared\\nservices model.\\n\\nreport product issues and work with emc (documentum vendor) to resolve the\\nissues and test all fixes and system patch releases provided by the vendor.\\n\\naward method of original contract: intergovernmental\\n\\nfms contract type: itcs negotiated acquisition 2\\n\\nend date of original contract: 12/31/14\\n\\nmethod of renewal/extension the agency intends to utilize: extension\\n\\nnew start date of the proposed renewed/extended contract: 01/01/15\\n\\nnew end date of the proposed renewed/extended contract: 06/30/15\\n\\nmodifications sought to the nature of services performed under the contract:\\nnone\\n\\nreason(s) the agency intends to renew/extend the contract: continuation of\\nservices.\\n\\npersonnel in substantially similar titles within agency: none\\n', 'senior network\\narchitect\\n\\nheadcount of personnel in substantially similar titles within agency: 2\\nagency: department of information technology and telecommunications\\n\\ndescription of services sought: establish a logging and recording (l&r)\\nsystem for both nypd and fdny at psac2, including the design, implementation,\\ninstallation and testing.\\n\\nstart date of the proposed contract: 4/1/2015\\n\\nend date of the proposed contract: 3/31/2018\\n\\nmethod of solicitation the agency intends to utilize: negotiated acquisition\\n\\npersonnel in substantially similar titles within agency: infrastructure\\nproject manager, infrastructure architect, infrastructure engineer\\n\\nheadcount of personnel in substantially similar titles within agency: 22\\nnotice of intent to extend contract(s) not included in fy 2015 annual\\ncontracting plan and schedule\\nnotice is hereby given that the mayor will be entering into the following extension(s) of (a) contract(s) not included in the fy 2015 annual contracting plan and schedule that is published pursuant to new york city charter § 312(a): \\nagency: department of information technology & telecommunications\\n\\nfms contract #: cta1-858- 20147204172\\n\\nvendor: comsys it services llc\\n\\ndescription of services:\\n\\nprovide subject matter expertise in the development of the application\\nrequirements along with the design, build and deployment of the application in\\nall environments.\\n\\ndevelop the approach for the future business requirements, plan, design, build\\nand deploy future enhancements and extensions to the existing system.\\n\\ndesign and develop the documentum platform as an enterprise wide content\\nmanagement system that can be extended to various city agencies on a shared\\nservices model.\\n\\nreport product issues and work with emc (documentum vendor) to resolve the\\nissues and test all fixes and system patch releases provided by the vendor.\\n\\naward method of original contract: intergovernmental\\n\\nfms contract type: itcs negotiated acquisition 2\\n\\nend date of original contract: 12/31/14\\n\\nmethod of renewal/extension the agency intends to utilize: extension\\n\\nnew start date of the proposed renewed/extended contract: 01/01/15\\n\\nnew end date of the proposed renewed/extended contract: 06/30/15\\n\\nmodifications sought to the nature of services performed under the contract:\\nnone\\n\\nreason(s) the agency intends to renew/extend the contract: continuation of\\nservices.\\n\\npersonnel in substantially similar titles within agency: none\\n', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none ', 'none ', 'headcount')]\n", "--------\n", "\n", "****personnel in substantially similar titles within agency == [('none', 'none', '')]\n", "--------\n", "\n", "4 out of 67 entries not parsed\n", "They are...\n", "public notice is hereby given that the \n", "\n", " franchise and concession review committee will hold a \n", "\n", " public meeting on wednesday\n", "---\n", "\n", "\n", "please see attached document.\n", "---\n", "\n", "\n", "nan\n", "---\n", "\n", "\n", "nan\n", "---\n", "\n", "\n" ] } ], "source": [ "# Iterate over the data and see if we can scrape\n", "#\n", "unparsed_count = 0\n", "unparsed = []\n", "moc_ads = mocs_ads['scrape'].values\n", "for ad in moc_ads:\n", " # remove multiple spaces and newlines\n", " txt = re.sub(' +', ' ', ad)\n", " txt = re.sub('\\n{3,}', '\\n', txt)\n", " match = rex_time.findall(txt)\n", " if match:\n", " print('Extracted: {}'.format(match))\n", " else:\n", " # dispatch parsing to appropriate collection of regex\n", " if 'notice of intent to extend contract(s)' in txt or \\\n", " 'notice of intent to issue new solicitation' in txt:\n", " out = {}\n", " for k in extend_contract_keys:\n", " match = rex_extend_contract[k].findall(txt)\n", " if len(match) and isinstance(match[0], str):\n", " match = [match[0].strip()]\n", " if len(match):\n", " out[k] = match[0]\n", " if len(out.keys()):\n", "# print('Parsed...{0}'.format(txt))\n", " xxx = 'personnel in substantially similar titles within agency'\n", " print ('****{} == [{}]'.format(xxx, out[xxx]))\n", " \n", " \n", " else:\n", " print(txt)\n", " unparsed_count += 1\n", " unparsed.append(txt)\n", " print('--------\\n')\n", " \n", "print ('{} out of {} entries not parsed\\nThey are...'.format(unparsed_count, len(moc_ads)))\n", "for t in unparsed:\n", " print ('{}\\n---\\n\\n'.format(t))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 0 }