def _convert_or_keep(column, converter):
    '''
    Best-effort dtype conversion.

    Params: column    -> pandas Series to convert
            converter -> callable taking the Series and returning the converted Series
    Output: converter(column), or the untouched column when the conversion
            raises ValueError/TypeError (same outcome errors='ignore' used to give,
            without relying on that deprecated option)
    '''
    try:
        return converter(column)
    except (ValueError, TypeError):
        return column


def get_data(roleName, timeout=30):
    '''
    Get certified H1B info for a given rolename from h1bdata.info

    Params: roleName -> Role to retrieve the data for (URL-encoded before use)
            timeout  -> Seconds to wait for the HTTP response (default 30)
    Output: Dataframe in format ['Employer', 'Title', 'Salary', 'Location', 'Start Date'].
            Only rows whose status cell is 'CERTIFIED' are kept. An empty Dataframe
            with the same columns is returned when the page has no table or no
            certified rows. 'Salary' is numeric and 'Start Date' is datetime when
            every value converts; otherwise that column is left as text.
    Raises: requests.exceptions.HTTPError on a 4xx/5xx response; other
            requests exceptions (timeout, connection failure) propagate unchanged.
    '''
    # Imported locally so this cell does not depend on edits to the imports cell.
    from urllib.parse import quote_plus

    columns = ['Employer', 'Title', 'Salary', 'Location', 'Start Date']

    #build url from role name
    # quote_plus still maps spaces to '+', and also escapes characters such as
    # '&', '+' or '#' that would otherwise corrupt the query string.
    url = ('https://h1bdata.info/index.php?em=&job=' + quote_plus(roleName)
           + '&city=&year=All+Years')
    print(url)

    #make soup
    page = requests.get(url, timeout=timeout)
    try:
        page.raise_for_status()  # do not try to parse an error page
        soup = BeautifulSoup(page.text, 'html.parser')
    finally:
        page.close()  # release the connection even if parsing fails

    tables = soup.find_all('table')
    if not tables:
        return pd.DataFrame(columns=columns)

    #find relevant info
    # The table text is read as one flat stream and cut into groups of 7 cells.
    # NOTE(review): this assumes every cell yields exactly one non-empty string;
    # an empty cell would shift the grouping -- confirm against the live markup.
    cells = iter(tables[0].stripped_strings)
    rows = itertools.zip_longest(*([cells] * 7))

    # Keep positions 0-3 and 5 (position 4 and the status at position 6 are
    # dropped). The header group and any short trailing group fail the status test.
    certified = [(row[0], row[1], row[2], row[3], row[5])
                 for row in rows if row[6] == 'CERTIFIED']
    if not certified:
        return pd.DataFrame(columns=columns)

    data = pd.DataFrame(certified, columns=columns)

    # Thousands separators ("120,000") would make to_numeric fail for the whole
    # column, so they are removed before converting.
    data['Salary'] = _convert_or_keep(
        data['Salary'],
        lambda col: pd.to_numeric(col.str.replace(',', '', regex=False)))
    data['Start Date'] = _convert_or_keep(data['Start Date'], pd.to_datetime)

    return data
#Variations of 'Data Scientist'
# Several entries are deliberately truncated ('Data Scie', 'Machine lea'); they are
# passed to get_data() exactly as written.
roleName_List=['Data Scie','Associate Data Scie','Junior Data Scie','Senior Data Scie','Sr Data Scien','Sr. Data Scien',
               'Lead Data Scien','Principal Data Scien','Jr Data Scien','Jr. Data Scien','Graduate Data Scien',
               'ML','Machine lea','Applied Data Scientist']

# Collect one frame per role and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row on
# each iteration.
role_frames=[]
for role in roleName_List:

    print(role)

    role_frames.append(get_data(role))

# Empty frames are skipped so they cannot influence the combined dtypes;
# ignore_index gives the result one clean 0..n-1 index instead of repeated
# per-role row labels.
non_empty_frames=[frame for frame in role_frames if not frame.empty]
if non_empty_frames:
    final_df=pd.concat(non_empty_frames,ignore_index=True) #Result collector Dataframe
else:
    final_df=pd.DataFrame()

# --- next cell: export ---
# The file name is kept exactly as in the original notebook so existing
# consumers of the CSV keep working.
final_df.to_csv('FINAL_ALL_H1B_Certified_Data_Scienitist.csv',index=False)