python - Creating a Large DataFrame from smaller DataFrames
I am having an issue with the structure of data scraped off the PGA Tour website. I am having trouble putting the data into a DataFrame and merging the data so I can use the DataFrame for analysis later. The dimensions of the scraped data are never right, and I get a separate error each time I run the code that I can't seem to reconcile.
I have tried merging and concatenating the DataFrames, but nothing seems to work. Any help is appreciated.
I want a DataFrame that contains the individual statistics from the separate pages on the same row as the other data, formatted by year and player name.
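For example, this toy sketch (the stat names, values, and players are made up) shows the shape I am after: each per-stat table outer-merged into one row per year and player:

import functools
import pandas as pd

driving = pd.DataFrame({'Year': ['2017', '2017'],
                        'PLAYER NAME': ['Player A', 'Player B'],
                        'AVG. DISTANCE': [305.1, 298.7]})
putting = pd.DataFrame({'Year': ['2017', '2017'],
                        'PLAYER NAME': ['Player A', 'Player B'],
                        'PUTTS PER ROUND': [28.4, 29.1]})

# an outer merge keeps a player even if one stat page is missing him
merged = functools.reduce(
    lambda left, right: pd.merge(left, right, on=['Year', 'PLAYER NAME'], how='outer'),
    [driving, putting])
print(merged)  # one row per (Year, PLAYER NAME), every stat as its own column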
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import socket
import urllib.error
import pandas as pd
import urllib
import sqlalchemy
import numpy as np
import functools

base = 'http://www.pgatour.com/'
inn = 'stats/stat'
end = '.html'
years = ['2017', '2016']

alpha = []
# all pages with links to tables
urls = ['http://www.pgatour.com/stats.html',
        'http://www.pgatour.com/stats/categories.ROTT_INQ.html',
        'http://www.pgatour.com/stats/categories.RAPP_INQ.html',
        'http://www.pgatour.com/stats/categories.RARG_INQ.html',
        'http://www.pgatour.com/stats/categories.RPUT_INQ.html',
        'http://www.pgatour.com/stats/categories.RSCR_INQ.html',
        'http://www.pgatour.com/stats/categories.RSTR_INQ.html',
        'http://www.pgatour.com/stats/categories.RMNY_INQ.html',
        'http://www.pgatour.com/stats/categories.RPTS_INQ.html']
for i in urls:
    data = urlopen(i)
    soup = BeautifulSoup(data, "html.parser")
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            alpha.append(base + link['href'][17:])  # may need adjusting

# data links
beta = []
for i in alpha:
    if inn in i:
        beta.append(i)

# deduplicate the links
gamma = []
for i in beta:
    if i not in gamma:
        gamma.append(i)

jan = []
for i in gamma:
    try:
        data = urlopen(i)
        soup = BeautifulSoup(data, "html.parser")
        for table in soup.find_all('section', {'class': 'module-statistics-off-the-tee-details'}):
            for j in table.find_all('h3'):
                y = j.get_text().replace(" ", "").replace("-", "").replace(":", "").replace(">", "").replace("<", "").replace(")", "").replace("(", "").replace("=", "").replace("+", "")
                jan.append([i, str(y + '.csv')])
                print([i, str(y + '.csv')])
    except Exception as e:
        print(e)
        pass

# my problem starts here
# using a short urls list so I can find the error faster
urls = [['http://www.pgatour.com/stats/stat.02356.html', 'd'],
        ['http://www.pgatour.com/stats/stat.02568.html', 'f'],
        ['http://www.pgatour.com/stats/stat.111.html', 'r']]
list = []
master = pd.DataFrame()
# jan = [['http://www.pgatour.com/stats/stat.02356.html', 'last15eventsscoring.csv']]
# make a list of url, title name, and cleaned csv name
# write to csv
row_sp = []
rows_sp = []
title1 = []
title = []
for i in urls:
    try:
        for y in years:
            data = urlopen(i[0][:-4] + y + end)
            soup = BeautifulSoup(data, "html.parser")
            data1 = urlopen(i[0])
            soup1 = BeautifulSoup(data1, "html.parser")
            # headers come from the current-year page
            for table in soup1.find_all('table', {'id': 'statsTable'}):
                title.append('Year')
                for k in table.find_all('tr'):
                    for n in k.find_all('th'):
                        title1.append(n.get_text())
                for l in title1:
                    if l not in title:
                        title.append(l)
                rows_sp.append(title)
            # data rows come from the per-year page
            for table in soup.find_all('table', {'id': 'statsTable'}):
                for h in table.find_all('tr'):
                    row_sp = [y]
                    for j in h.find_all('td'):
                        row_sp.append(j.get_text().replace(" ", "").replace("\n", "").replace("\xa0", " "))
                    rows_sp.append(row_sp)
            df = pd.DataFrame(rows_sp)
            df.columns = title
            df.drop(df.index[1], inplace=True)
            print(df)
            list.append(df)
    except Exception as e:
        print(e)
        pass

df_merge = functools.reduce(lambda left, right: pd.merge(left, right, on=['Year', 'PLAYER NAME'], how='outer'), list)
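As a sanity check on a single page, this minimal sketch lets pandas parse the table directly instead of building rows by hand (it assumes the table id really is 'statsTable' and reuses the per-year URL pattern from my code above; the stat number is just one of the pages I listed):

import pandas as pd

url = 'http://www.pgatour.com/stats/stat.02356.2017.html'
tables = pd.read_html(url, attrs={'id': 'statsTable'})
df = tables[0]
df['Year'] = '2017'
print(df.shape)  # every page/year should give a consistent column count before merging

If each page came back this cleanly, the merge at the end would be the only remaining step, so I suspect my problem is in how rows_sp and title accumulate across pages.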