python 3.x - Scraping information from a website using BeautifulSoup won't work
I have been using Beautiful Soup to extract information from the website http://slc.bioparadigms.org, but I am interested in the diseases and the OMIM number that each SLC transporter has. I have a list of transporters and want to extract these two characteristics for each one. The thing is, both are related to the class prt_col2, and if I search for that class I get a lot of hits. How can I get just the diseases? Sometimes there are no diseases related to an SLC transporter, or there is no OMIM number. How can I extract this information? I put screenshots below to show how it looks. Any help is highly appreciated! This is my first post here, so forgive me any mistakes or missing information. Thank you!
http://imgur.com/atigi84 and the other one: http://imgur.com/l65hsym
So ideally the output would be, for example:

Transporter: SLC1A1
Disease: epilepsy
OMIM: 12345
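One way to see which of the many prt_col2 hits is which is to print each value span together with the label that precedes it. Below is a quick diagnostic probe along those lines; the gene name in the URL and the label/value sibling layout are assumptions about the page markup, not something confirmed by the site's documentation.

import requests
from bs4 import BeautifulSoup

# Probe: list every prt_col2 value together with the label that precedes it,
# to see which hit holds the disease and which the OMIM number.
url = 'http://slc.bioparadigms.org/protein?GeneName=SLC1A1'  # gene name assumed
soup = BeautifulSoup(requests.get(url).text, 'lxml')
for value in soup.find_all('span', class_='prt_col2'):
    label = value.find_previous_sibling('span')
    label_text = label.get_text(strip=True) if label else '?'
    print(label_text, '->', value.get_text(strip=True)[:60])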
EDIT: the code I have so far:
import os
import re
from bs4 import BeautifulSoup as bs
import requests
import sys
import time

def hasNumbers(inputString):
    # get transporter names that contain numbers
    return any(char.isdigit() for char in inputString)

def get_list(file):
    # get the list of transporters from a text file
    transporter_list = []
    lines = [line.rstrip('\n') for line in open(file)]
    for line in lines:
        if 'SLC' in line and hasNumbers(line) == True:
            get_SLC = line.split()
            if 'SLC' in get_SLC[0]:
                transporter_list.append(get_SLC[0])
    return transporter_list

def get_transporter_webinfo(transporter_list):
    output_website = open("output_website.txt", "w")
    for transporter in transporter_list:  # website content of the transporters
        text = requests.get('http://slc.bioparadigms.org/protein?GeneName=' + transporter).text
        output_website.write(text)  # output the SLC tables of the website
        soup = bs(text, "lxml")
        disease = soup(text=re.compile('Disease'))
        characteristics = soup.find_all("span", class_="prt_col2")
        Memo = soup.find_all("span", class_='expandable prt_col2')
        print(transporter, disease, characteristics[6], Memo)

def convert(html_file):
    file2 = open(html_file, 'r')
    clean_file = open('text_format_SLC', 'w')
    soup = bs(file2, 'lxml')
    clean_file.write(soup.get_text())
    clean_file.close()

def main():
    start_time = time.time()
    os.chdir('/home/programming/fun stuff')
    sys.stdout = open("output_SLC.txt", "w")
    SLC_list = get_list("SLC.txt")
    get_transporter_webinfo(SLC_list)  # I already have the website content, so this is a little redundant
    print("This took", time.time() - start_time, "seconds to run")
    convert("output_SLC.txt")
    sys.stdout.close()

if __name__ == "__main__":
    main()
No offence intended, but I didn't feel like reading such a large piece of code as you put in your question.
I think it can be simplified.
You can get the complete list of links to the SLCs in the line at slcs = .... The next line shows how many there are, and the line beyond exhibits the href attribute that the last link contains, as an example.
In each SLC's page I look for the string 'Disease: ' and then, if it's there, navigate to the link nearby. I find the OMIM number in a similar way.
Notice that I process only the first SLC.
>>> import requests
>>> import bs4
>>> main_url = 'http://slc.bioparadigms.org/'
>>> main_page = requests.get(main_url).content
>>> main_soup = bs4.BeautifulSoup(main_page, 'lxml')
>>> slcs = main_soup.select('td.slct.tbl_cell.tbl_col1 a')
>>> len(slcs)
418
>>> slcs[-1].attrs['href']
'protein?GeneName=SLC52A3'
>>> stem_url = 'http://slc.bioparadigms.org/'
>>> for slc in slcs:
...     slc_page = requests.get(stem_url + slc.attrs['href']).content
...     slc_soup = bs4.BeautifulSoup(slc_page, 'lxml')
...     disease = slc_soup.find_all(string='Disease: ')
...     if disease:
...         disease = disease[0]
...         diseases = disease.findParent().findNextSibling().text.strip()
...     else:
...         diseases = 'no diseases'
...     omim = slc_soup.find_all(string='OMIM:')
...     if omim:
...         omim = omim[0]
...         number = omim.findParent().findNextSibling().text.strip()
...     else:
...         omim = 'no OMIM'
...         number = -1
...     slc.text, number, diseases
...     break
...
('SLC1A1', '133550', "Huntington's disease, epilepsy, ischemia, Alzheimer's disease, Niemann-Pick disease, obsessive-compulsive disorder")
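To tie this back to the output format asked for in the question, here is the same approach consolidated into a plain script that loops over every transporter instead of breaking after the first. This is a sketch, not a verified program: the label strings and the CSS selector come from the session above, while the field() helper, the fallback texts, and the polite delay between requests are my additions.

import time
import bs4
import requests

# Same technique as the interactive session above, wrapped in a loop over
# all transporters found on the front page.
base = 'http://slc.bioparadigms.org/'
soup = bs4.BeautifulSoup(requests.get(base).content, 'lxml')
links = soup.select('td.slct.tbl_cell.tbl_col1 a')

def field(page_soup, label):
    # Return the text of the value element next to the given label, or None
    # if the label does not occur on this transporter's page.
    hit = page_soup.find(string=label)
    if hit is None:
        return None
    return hit.findParent().findNextSibling().text.strip()

for link in links:
    page = bs4.BeautifulSoup(requests.get(base + link.attrs['href']).content, 'lxml')
    print('Transporter:', link.text)
    print('Disease:', field(page, 'Disease: ') or 'no diseases')
    print('OMIM:', field(page, 'OMIM:') or 'no OMIM number')
    time.sleep(0.5)  # be gentle with the server (my addition)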