import re
import os
# Go through all of our scraped web pages and pull out the information we actually want. In this instance, the code extracts the following fields: Attracts, Bloom Description, Bloom Time, Common Name, Culture, Family, Flower, Formal Name, Fruit, Garden Uses, Height, Invasive, Leaf, Maintenance, Native Range, Noteworthy Characteristics, Other, Problems, Spread, Suggested Use, Sun, Tolerate, Type, Water, and Zone.
# Output columns, in the order they are written to the CSV. Every entry except
# "Formal Name" doubles as the label text searched for in each scraped line.
COLUMNS = (
    "Attracts",
    "Bloom Description",
    "Bloom Time",
    "Common Name",
    "Culture",
    "Family",
    "Flower",
    "Formal Name",
    "Fruit",
    "Garden Uses",
    "Height",
    "Invasive",
    "Leaf",
    "Maintenance",
    "Native Range",
    "Noteworthy Characteristics",
    "Other",
    "Problems",
    "Spread",
    "Suggested Use",
    "Sun",
    "Tolerate",
    "Type",
    "Water",
    "Zone",
)

# "Formal Name" is taken from a fixed line position, not found by label.
_SEARCH_LABELS = tuple(label for label in COLUMNS if label != "Formal Name")

# Whitespace (spaces, tabs, line endings, ...) and commas at either end of a line.
_EDGE_JUNK = re.compile(r"^[\s,]+|[\s,]+\Z")

# Everything except letters, digits, spaces and colons is dropped from a value.
# Because commas are removed here, joining the fields with "," below cannot
# produce a malformed CSV row.
# NOTE(review): this also deletes ".", so a value such as "1.50" becomes
# "150" -- confirm that is intended before relying on numeric columns.
_NON_FIELD_CHARS = re.compile(r"[^A-Za-z0-9 :]+")


def _field_value(content):
    """Return the text after the first colon of *content*, or all of it.

    Only the segment between the first and second colon is kept. When
    *content* has no colon (for example because the only colon of the line
    was inside the markup after "<") the whole text is used instead of
    raising IndexError. Surrounding whitespace is trimmed.
    """
    parts = content.split(":")
    value = parts[1] if len(parts) > 1 else content
    return value.strip()


def extract_plant_fields(lines):
    """Collect the plant fields found in the lines of one scraped page.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        A dict with one entry per name in COLUMNS. Fields that never occur
        are "NA", so nothing leaks over from a previously processed page.

    For every line, only the text before the first "<" is kept and sanitised.
    The second line of the page (index 1) supplies "Formal Name". A label
    matches when it occurs anywhere in the raw line; if several lines contain
    the same label, the last one wins.
    """
    record = dict.fromkeys(COLUMNS, "NA")
    for index, line in enumerate(lines):
        before_markup = _EDGE_JUNK.sub("", line).split("<")[0]
        content = _NON_FIELD_CHARS.sub("", before_markup)
        if index == 1:
            record["Formal Name"] = content.strip()
        for label in _SEARCH_LABELS:
            if label in line:
                record[label] = _field_value(content)
    return record


def main(path="mobot_entries/scraped_results/",
         out_path="mobot_entries/cleaner_results/plants_mobot.csv"):
    """Write one CSV row per scraped page found in *path* to *out_path*.

    Args:
        path: directory holding the scraped pages.
        out_path: CSV file to create (overwritten if it exists).
    """
    # The with-block guarantees the CSV is flushed and closed even if a page
    # fails to parse half-way through the run.
    with open(out_path, "w") as out:
        out.write(",".join(COLUMNS) + "\n")
        # Sorted so that repeated runs produce rows in the same order.
        for filename in sorted(os.listdir(path)):
            page_path = os.path.join(path, filename)
            if not os.path.isfile(page_path):
                continue  # a sub-directory cannot be opened as a page
            # Undecodable bytes become U+FFFD, which the sanitiser removes,
            # so one badly encoded page does not abort the whole run.
            with open(page_path, errors="replace") as page:
                record = extract_plant_fields(page)
            out.write(",".join(record[column] for column in COLUMNS) + "\n")
            print("Finished up " + record["Formal Name"])
    print("Done!")


if __name__ == "__main__":
    main()