python - Creating Large DataFrame from smaller DataFrames -


i having issue structure of data off pga website. have trouble putting data dataframe , merging data can use dataframe analysis later. dimensions of scraped data never right. separate error each time run code cant seem reconcile.

i have tried merging , concatenating dataframes nothing seems work. appreciated

i dataframe contain individual statistics separate sites on same row other data formatted year , player name.

import csv urllib.request import urlopen bs4 import beautifulsoup import datetime import socket import urllib.error import pandas pd import urllib import sqlalchemy import numpy np import functools  base = 'http://www.pgatour.com/' inn = 'stats/stat' end = '.html' years = ['2017','2016']   alpha = [] #all pages links tables urls =     ['http://www.pgatour.com/stats.html','http://www.pgatour.com/stats/categories.rott_inq.html','http://www.pgatour.com/stats/categories.rapp_inq.html','http://www.pgatour.com/stats/categories.rarg_inq.html','http://www.pgatour.com/stats/categories.rput_inq.html','http://www.pgatour.com/stats/categories.rscr_inq.html','http://www.pgatour.com/stats/categories.rstr_inq.html','http://www.pgatour.com/stats/categories.rmny_inq.html','http://www.pgatour.com/stats/categories.rpts_inq.html'] in urls:     data = urlopen(i)     soup = beautifulsoup(data, "html.parser")     link in soup.find_all('a'):         if link.has_attr('href'):             alpha.append(base + link['href'][17:]) #may need adjusting #data links beta = [] in alpha:     if inn in i:         beta.append(i)  gamma = [] in beta:     if not in gamma:         gamma.append(i)  jan = [] in gamma:     try:         data = urlopen(i)         soup = beautifulsoup(data, "html.parser")         table in soup.find_all('section',{'class':'module-statistics-off-the-tee-details'}):             j in table.find_all('h3'):                 y=j.get_text().replace(" ","").replace("-","").replace(":","").replace(">","").replace("<","").replace(">","").replace(")","").replace("(","").replace("=","").replace("+","")                 jan.append([i,str(y+'.csv')])                 print([i,str(y+'.csv')])     except exception e:             print(e)             pass  #my problem starts here #using urls list can find error faster urls = [['http://www.pgatour.com/stats/stat.02356.html','d']     ,['http://www.pgatour.com/stats/stat.02568.html','f']     ,['http://www.pgatour.com/stats/stat.111.html','r']]         list = [] master = pd.dataframe() #jan = [['http://www.pgatour.com/stats/stat.02356.html', 'last15eventsscoring.csv']] #make list url , title name , cleaned csv name #write csv row_sp = [] rows_sp =[] title1 = []  title = []   in urls:     try:         y in years:             data = urlopen(i[0][:-4] +y+ end)             soup = beautifulsoup(data, "html.parser")             data1 = urlopen(i[0])             soup1 = beautifulsoup(data1, "html.parser")             table in soup1.find_all('table',{'id':'statstable'}):                 title.append('year')                 k in table.find_all('tr'):                     n in k.find_all('th'):                         title1.append(n.get_text())                         l in title1:                             if l not in title:                                 title.append(l)                 rows_sp.append(title)             table in soup.find_all('table',{'id':'statstable'}):                 h in table.find_all('tr'):                     row_sp = [y]                     j in h.find_all('td'):                         row_sp.append(j.get_text().replace(" ","").replace("\n","").replace("\xa0"," "))                     rows_sp.append(row_sp)             df=pd.dataframe(rows_sp)             df.columns = title             df.drop(df.index[1],inplace = true)             print(df)             list.append(df)     except exception e:         print(e)         pass df_merge = functools.reduce(lambda  left,right: pd.merge(left,right,on=['year','player name'], how='outer'), list) 


Comments

Popular posts from this blog

ubuntu - PHP script to find files of certain extensions in a directory, returns populated array when run in browser, but empty array when run from terminal -

php - How can i create a user dashboard -

javascript - How to detect toggling of the fullscreen-toolbar in jQuery Mobile? -