from bs4 import BeautifulSoup
import time
from datetime import date, datetime, timedelta
import cloudscraper
import numpy as np
import pandas as pd
import os
import hashlib
import argparse
import re
#import httpx
#import cfscrape

# Default query parameters
MARKET = 'residential'
TYPE = 'condo'
STATE = 'kl'

### CODE STARTS HERE ###

# Query-string fragments for each property type supported by PropertyGuru
property_type = {'all':'',
                 'bungalow':'&property_type_code%5B%5D=BUNG&property_type_code%5B%5D=LBUNG&property_type_code%5B%5D=ZBUNG&property_type_code%5B%5D=TWINV&property_type_code%5B%5D=TWINC&property_type=B',
                 'condo':'&property_type_code%5B%5D=CONDO&property_type_code%5B%5D=APT&property_type_code%5B%5D=FLAT&property_type_code%5B%5D=PENT&property_type_code%5B%5D=SRES&property_type_code%5B%5D=STDIO&property_type_code%5B%5D=DUPLX&property_type_code%5B%5D=TOWNC&property_type=N',
                 'semid':'&property_type_code%5B%5D=SEMI&property_type_code%5B%5D=CLUS&property_type=S',
                 'terrace':'&property_type_code%5B%5D=TERRA&property_type_code%5B%5D=TOWN&property_type_code%5B%5D=TER1&property_type_code%5B%5D=TER15&property_type_code%5B%5D=TER2&property_type_code%5B%5D=TER25&property_type_code%5B%5D=TER3&property_type_code%5B%5D=TER35&property_type=T',
                 'land':'&property_type_code%5B%5D=RLAND&property_type=L'}

# Query-string fragments for each Malaysian state/region
state = {'all':'',
         'johor':'&region_code=MY01',
         'kedah':'&region_code=MY02',
         'kelantan':'&region_code=MY03',
         'melaka':'&region_code=MY04',
         'ns':'&region_code=MY05',
         'pahang':'&region_code=MY06',
         'penang':'&region_code=MY07',
         'perak':'&region_code=MY08',
         'perlis':'&region_code=MY09',
         'selangor':'&region_code=MY10',
         'terengganu':'&region_code=MY11',
         'sabah':'&region_code=MY12',
         'sarawak':'&region_code=MY13',
         'kl':'&region_code=MY14',
         'labuan':'&region_code=MY15',
         'putrajaya':'&region_code=MY16',
         'other':'&region_code=MY99'}

def BSPrep(URL):
    """Fetch a URL through cloudscraper and return a BeautifulSoup object,
    retrying on captchas and connection errors."""
    exitcode = 1
    while exitcode == 1:
        try:
            trial = 0
            while trial < 10:
                scraper = cloudscraper.create_scraper()
                #client = httpx.Client(http2=True)
                print('Loading '+URL)
                s = scraper.get(URL)
                soup = BeautifulSoup(s.content, 'html.parser')
                if "captcha" in soup.text:
                    trial += 1
                    print('Retrying ('+str(trial)+'/10) ...')
                    time.sleep(0.1)
                    continue
                elif "No Results" in soup.text:
                    print('Invalid URL, skipping '+URL)
                    trial = 99
                else:
                    trial = 99
            if trial == 10:
                print('Trial exceeded, skipping '+URL)
            exitcode = 0
            return soup
        except:
            print('Connection reset, retrying in 1 min...', flush=True)
            time.sleep(60)

def Pagination(soup):
    """Return the number of result pages reported by the pagination widget."""
    pagination = soup.find("ul", class_="pagination")
    try:
        if pagination.find_all("li", class_="pagination-next disabled"):
            pages = int(pagination.find_all("a")[0]['data-page'])
        else:
            pages = int(pagination.find_all("a")[-2]['data-page'])
    except AttributeError:
        if soup.find("h1", class_="title search-title").text.split(' ')[2] == '0':
            print('No property found.\nScraping stopped.')
            exit(0)
        else:
            exit(1)
    return pages

def LinkScraper(soup):
    """Collect (property name, URL) pairs for projects that have active listings."""
    links = []
    units = soup.find_all("div", itemtype="https://schema.org/Place")
    for unit in units:
        # Skip projects with neither units for sale nor units for rent
        if unit.find("a", class_="btn btn-primary-outline units_for_sale disabled") and unit.find("a", class_="btn btn-primary-outline units_for_rent disabled"):
            continue
        prop = unit.find("a", class_="nav-link")
        links.append((prop['title'], HEADER+prop["href"]))
    return links

def InfoExtract(pname, soup, key):
    """Extract one row per listing from a single sale/rent results page."""
    # Sale prices take the last token of the price span, rental prices the first
    i = -1 if 'sale' in key else 0
    list_type = 'Sale' if 'sale' in key else 'Rent'
    page_listing = []
    for listing in soup.find_all(itemtype="https://schema.org/Place"):
        listid = listing["data-listing-id"]
        try:
            bed = listing.find('span', class_="bed").text.strip()
            bath = listing.find('span', class_="bath").text.strip()
        except AttributeError:
            try:
                bed = listing.find("li", class_="listing-rooms pull-left").text.strip()
                bath = listing.find("li", class_="listing-rooms pull-left").text.strip()
            except AttributeError:
                bed = np.nan
                bath = np.nan
        try:
            price = float(listing.find("span", class_="price").text.split(' ')[i].replace(',','').strip())
        except AttributeError:
            price = np.nan
        except ValueError:
            price = listing.find("span", class_="price").text.split(' ')[i].replace(',','').strip()
        try:
            sqft = int(listing.find("li", class_="listing-floorarea pull-left").text.split(' ')[0])
        except (AttributeError, ValueError):
            sqft = np.nan
        try:
            author = listing.find('span', class_='name').text
        except AttributeError:
            author = np.nan
        # Convert the relative posting age (e.g. "3h", "5d") to an absolute timestamp
        posted = listing.find("i", class_="pgicon pgicon-clock-o").text
        if posted[-1] == "h":
            listtime = str(datetime.now()-timedelta(hours=int(posted[0:-1])))
        elif posted[-1] == "d":
            listtime = str(datetime.now()-timedelta(days=int(posted[0:-1])))
        else:
            listtime = np.nan
        page_listing.append([listid, pname, list_type, price, bed, bath, sqft, author, listtime])
    return page_listing

def PropScrapper(pname, plink, key):
    """Scrape every sale or rental listing page of a single property."""
    prop_listing = []
    soup = BSPrep(plink.replace('/condo/', key))
    pid = re.search(r'\d+$', plink).group(0)
    title = soup.find('h1', class_='title search-title text-transform-none')
    total = title['title'].split(' ')[0]
    prop_listing += [row + [pid] for row in InfoExtract(pname, soup, key)]
    # 20 listings per page; fetch the remaining pages if there are more
    if total != 'No' and int(total) > 20:
        for page in range(2, int(total)//20+2):
            soup = BSPrep(plink.replace('/condo/', key)+'/'+str(page))
            prop_listing += [row + [pid] for row in InfoExtract(pname, soup, key)]
    return prop_listing

def md5hash(datafile, hashfile):
    """Write the MD5 checksum of datafile to hashfile."""
    h = hashlib.md5()
    with open(datafile, 'rb') as file:
        for chunk in iter(lambda: file.read(1024), b''):
            h.update(chunk)
    with open(hashfile, 'w') as f:
        f.write(h.hexdigest())
    print('MD5 hash generated to '+hashfile)

def PropTrimmer(props, datafile):
    """Drop properties already covered by a previous run, resuming from the last one scraped."""
    df_old = pd.read_csv(datafile)
    prop, link = zip(*props)
    len_old_props = len(prop)
    last_prop_name = df_old.PropertyName.iat[-1]
    prop_index = prop.index(last_prop_name)
    props = props[prop_index:]
    print('This is a re-run.\nSkipping {} properties scraped previously.'.format(len_old_props-len(props)))
    return props, last_prop_name

def argparser():
    parser = argparse.ArgumentParser()
    try:
        parser.add_argument('-m', '--market', default=MARKET, dest='Market',
                            help='eg. Residential, Commercial etc. (default: residential)')
        parser.add_argument('-t', '--type', default=TYPE, dest='Type',
                            help='eg. Condo, Terrace etc. (default: condo)')
        parser.add_argument('-s', '--state', default=STATE, dest='State',
                            help='eg. KL, Selangor, Johor etc. (default: kl)')
        args = parser.parse_args()
        return args
    except:
        parser.print_help()
        exit()

def main():
    # Load the first page with the query and scrape the number of pages
    print('\n===================================================\nPropertyGuru Property Listing Scraper v2.4-alpha\nAuthor: DicksonC\n===================================================\n')
    time.sleep(2)
    print('Job initiated with query on {} in {}.'.format(TYPE, STATE))
    print('\nLoading '+HEADER+KEY+QUERY+' ...\n')
    soup = BSPrep(HEADER+KEY+QUERY)
    pages = Pagination(soup)
    print(str(pages)+' page(s) will be scraped.\n')

    # Scrape links from the first page for properties with sale or rental listings
    props = []
    props += LinkScraper(soup)
    print('\rPage 1/{} done.'.format(str(pages)))

    # Scrape subsequent pages
    for page in range(2, pages+1):
        soup = BSPrep(HEADER+KEY+'/'+str(page)+QUERY)
        props += LinkScraper(soup)
        print('\rPage {}/{} done.'.format(str(page), str(pages)))

    # Check existing data and remove links already scraped
    if os.path.exists(RAW_LISTING):
        try:
            props, last_prop_name = PropTrimmer(props, RAW_LISTING)
            error_flag = False
        except (ValueError, IndexError):
            print("EOF does not match. Scraping starts from the beginning.")
            error_flag = True

    # Scrape sale and rental details of each property
    data = []
    print('\nA total of '+str(len(props))+' properties will be scraped.\n')
    try:
        for i, prop in enumerate(props):
            sale = PropScrapper(*prop, '/property-for-sale/at-')
            rent = PropScrapper(*prop, '/property-for-rent/at-')
            print(str(i+1)+'/'+str(len(props))+' done!')
            data += sale
            data += rent

        # Result into DataFrame
        df = pd.DataFrame(data, columns=['ListID','PropertyName','Type','Price','Bedrooms','Bathrooms','Sqft','Author','ListDate','PropertyID'])

        # Check if data directory exists
        if not os.path.isdir(LIST_DIR):
            os.makedirs(LIST_DIR)

        # Check existing data and combine
        if os.path.exists(RAW_LISTING):
            if not error_flag:
                df_old = pd.read_csv(RAW_LISTING)
                df_old = df_old[df_old.PropertyName != last_prop_name]
                df = pd.concat([df_old, df])

        # Raw data saved to file
        df.to_csv(RAW_LISTING, index=False)
        print('Raw data saved to {}'.format(RAW_LISTING))
    except:
        print('Error encountered! Exporting current data ...')

        # Result into DataFrame
        df = pd.DataFrame(data, columns=['ListID','PropertyName','Type','Price','Bedrooms','Bathrooms','Sqft','Author','ListDate','PropertyID'])

        # Check if data directory exists
        if not os.path.isdir(LIST_DIR):
            os.makedirs(LIST_DIR)

        # Raw data saved to file
        df.to_csv(RAW_LISTING, index=False)
        print('INCOMPLETE raw data saved to {}'.format(RAW_LISTING))
        exit(1)
    else:
        # Check if hash directory exists
        if not os.path.isdir(HASH_DIR):
            os.makedirs(HASH_DIR)
        md5hash(RAW_LISTING, MD5HASH)

if __name__ == "__main__":
    # Initialize arguments
    args = argparser()
    MARKET, TYPE, STATE = args.Market, args.Type, args.State

    # Initialize filenames (leave empty if not generating)
    LIST_DIR = './data/{}'.format(date.today().strftime("%b%Y"))
    HASH_DIR = './md5hash/{}'.format(date.today().strftime("%b%Y"))
    RAW_LISTING = './data/{}/{}-{}-{}-listing.csv'.format(date.today().strftime("%b%Y"), TYPE, STATE, date.today().strftime("%b%Y"))
    MD5HASH = './md5hash/{}/{}-{}-{}-listing.md5'.format(date.today().strftime("%b%Y"), TYPE, STATE, date.today().strftime("%b%Y"))

    # Initialize URL
    HEADER = 'https://www.propertyguru.com.my'
    KEY = '/condo/search-project'
    if STATE.lower() == 'other':
        QUERY = '?limit=200&market='+MARKET.lower()+property_type[TYPE.lower()]+state[STATE.lower()]+'&newProject=all'
    else:
        QUERY = '?limit=500&market='+MARKET.lower()+property_type[TYPE.lower()]+state[STATE.lower()]+'&newProject=all'

    main()
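
# Example usage (the script filename below is assumed; substitute the actual file name):
#   python propertyguru_scraper.py -m residential -t condo -s selangor
# With the defaults above, running without flags scrapes condos in KL. The raw CSV is
# written to ./data/<MonYYYY>/<type>-<state>-<MonYYYY>-listing.csv and its MD5 checksum
# to ./md5hash/<MonYYYY>/<type>-<state>-<MonYYYY>-listing.md5. Re-running within the same
# month for the same type/state resumes from the last property recorded in that CSV.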