{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Premier League market values (Transfermarkt)\n",
    "\n",
    "Scrapes the first four pages of Transfermarkt's Premier League market-value ranking and builds one `pandas` DataFrame holding each player's rank, name, position, age, market value, nationality and club.\n",
    "\n",
    "The notebook is written to run top to bottom on a fresh kernel (Restart Kernel, then Run All): every page is downloaded exactly once and no cell modifies a list in place, so any cell can be re-run safely."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -q bs4 html5lib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.common.exceptions import NoSuchElementException\n",
    "from selenium.webdriver.chrome.options import Options\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'\n",
    "URL_TEMPLATE = 'https://www.transfermarkt.co.uk/premier-league/marktwerte/wettbewerb/GB1/page/{}'\n",
    "headers = {'User-Agent': USER_AGENT}\n",
    "\n",
    "PAGES = range(1, 5)  # ranking pages to download: 1, 2, 3 and 4\n",
    "REQUEST_TIMEOUT_S = 30  # seconds to wait for the server before giving up\n",
    "\n",
    "# Each player contributes this many <tr> elements to the table body and only the\n",
    "# first of every group holds the complete record. Presumably the other two belong\n",
    "# to the nested name/position table - TODO confirm against the live markup.\n",
    "ROWS_PER_PLAYER = 3\n",
    "\n",
    "# One label per <td> of a player row; the 'null*' columns carry no useful text.\n",
    "COLUMNS = ['Rank', 'null', 'null1', 'Player', 'Position', 'null3', 'Age', 'null4', 'Value']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_webpage(url, timeout=REQUEST_TIMEOUT_S):\n",
    "    \"\"\"Download `url` and return it parsed as a BeautifulSoup document.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to fetch.\n",
    "        timeout: Seconds to wait for the server before the request is abandoned.\n",
    "\n",
    "    Returns:\n",
    "        The response body parsed with the `html.parser` backend.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the server answers with a 4xx or 5xx status.\n",
    "        requests.Timeout: If no answer arrives within `timeout` seconds.\n",
    "    \"\"\"\n",
    "    response = requests.get(url=url, headers=headers, timeout=timeout)\n",
    "    # Fail here rather than later on a confusing 'NoneType has no attribute find'.\n",
    "    response.raise_for_status()\n",
    "    return BeautifulSoup(response.content, 'html.parser')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_table_rows(soup):\n",
    "    \"\"\"Return every <tr> found under the <tbody> of the page's `items` table.\n",
    "\n",
    "    Args:\n",
    "        soup: Parsed ranking page, as returned by `scrape_webpage`.\n",
    "\n",
    "    Returns:\n",
    "        List of <tr> tags in document order, nested rows included.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the page has no `items` table or that table has no <tbody>.\n",
    "    \"\"\"\n",
    "    table = soup.find('table', class_='items')\n",
    "    if table is None:\n",
    "        raise ValueError(\"page has no <table class='items'>\")\n",
    "    tbody = table.find('tbody')\n",
    "    if tbody is None:\n",
    "        raise ValueError(\"the 'items' table has no <tbody>\")\n",
    "    return tbody.find_all('tr')\n",
    "\n",
    "\n",
    "def row_texts(row):\n",
    "    \"\"\"Return the stripped text of every <td> inside `row` (nested cells included).\"\"\"\n",
    "    return [cell.text.strip() for cell in row.find_all('td')]\n",
    "\n",
    "\n",
    "def row_badge_titles(row):\n",
    "    \"\"\"Return the `title` of the first <img> in each `zentriert` cell of `row`.\n",
    "\n",
    "    Cells without an image are skipped; an image without a `title` yields None.\n",
    "    For a player row the result is expected to be [nationality, club] -\n",
    "    TODO confirm on the live page.\n",
    "    \"\"\"\n",
    "    titles = []\n",
    "    for cell in row.find_all('td', class_='zentriert'):\n",
    "        image = cell.find('img')\n",
    "        if image is not None:\n",
    "            titles.append(image.get('title'))\n",
    "    return titles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Optional: cookie banner in a Selenium session\n",
    "\n",
    "The scrape below uses plain `requests` and never opens a browser. This helper is only for browser-driven experiments and is not called anywhere in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def accept_cookie_banner(driver, frame=None):\n",
    "    \"\"\"Click the element titled \"ACCEPT ALL\" in a Selenium session, if present.\n",
    "\n",
    "    Args:\n",
    "        driver: Selenium WebDriver that has already loaded the page.\n",
    "        frame: Optional name or id of an iframe to switch into before searching.\n",
    "            NOTE(review): earlier scratch code used 'sp_message_iframe_764224';\n",
    "            verify that id is still current before relying on it.\n",
    "\n",
    "    Returns:\n",
    "        True if the element was found and clicked, False if it was not found.\n",
    "    \"\"\"\n",
    "    if frame is not None:\n",
    "        driver.switch_to.frame(frame)\n",
    "    try:\n",
    "        button = driver.find_element(by=By.XPATH, value='//*[@title=\"ACCEPT ALL\"]')\n",
    "        button.click()\n",
    "        return True\n",
    "    except NoSuchElementException:\n",
    "        return False\n",
    "    finally:\n",
    "        # Leave the driver pointing at the main document whatever happened.\n",
    "        if frame is not None:\n",
    "            driver.switch_to.default_content()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape\n",
    "\n",
    "Each ranking page is requested once; all later cells work on the rows collected here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_rows = []\n",
    "for pagenum in PAGES:\n",
    "    soup = scrape_webpage(URL_TEMPLATE.format(pagenum))\n",
    "    all_rows.extend(extract_table_rows(soup))\n",
    "\n",
    "len(all_rows)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build player records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_players = [row_texts(row) for row in all_rows]\n",
    "\n",
    "# Keep the first <tr> of every group of ROWS_PER_PLAYER, both as tags and as text.\n",
    "player_rows = all_rows[::ROWS_PER_PLAYER]\n",
    "final_players = all_players[::ROWS_PER_PLAYER]\n",
    "\n",
    "assert final_players, 'no player rows were scraped'\n",
    "assert all(len(cols) == len(COLUMNS) for cols in final_players), 'a player row does not have the expected number of cells'\n",
    "\n",
    "final_players[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the badges from each player's own row so they always line up with final_players.\n",
    "badges = [row_badge_titles(row) for row in player_rows]\n",
    "nationality_list = [titles[0] if len(titles) > 0 else None for titles in badges]\n",
    "club_list = [titles[1] if len(titles) > 1 else None for titles in badges]\n",
    "\n",
    "list(zip(nationality_list, club_list))[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(final_players, columns=COLUMNS).assign(\n",
    "    Nationality=nationality_list,\n",
    "    Club=club_list,\n",
    ")\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Per-column lists\n",
    "\n",
    "Plain Python lists derived from `df`, for code that expects the list names used in earlier drafts of this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "players_list = df['Player'].tolist()\n",
    "position_list = df['Position'].tolist()\n",
    "age_list = df['Age'].tolist()\n",
    "value_list = df['Value'].tolist()\n",
    "badge_list = [title for titles in badges for title in titles]\n",
    "\n",
    "players_list[:5]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.1"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}