python - Retrieve hidden phones number with Scrapy -
here spider :
# -*- coding: utf-8 -*- import scrapy scrapy.spiders import crawlspider, rule scrapy.linkextractors import linkextractor class examplespider(crawlspider): name = 'example' allowed_domains = ['www.example.com'] start_urls = ['http://www.example.com'] rules = ( rule(linkextractor(allow=('/items/.'),deny=('sendmessage')), follow=true), rule(linkextractor(allow=('/item/[a-z\+]+\-[0-9]+') ,deny=('sendmessage')), callback='parse_item', follow=false), ) def parse_item(self, response): name = response.xpath('//*[@id="main"]/h1/text()').extract_first() locations = response.xpath("//*[@id='mylocation']//div[@class='location']") list_location = [] (i, location) in enumerate(locations): location_tab = {} phoneitem = [] data = [ ('somedata', ''), ('somedata', ''), ('somedata', ''), ] request = scrapy.formrequest( 'http://www.example.com/ajax/my-phone', formdata= data, callback=self.parse_for_number ) request.meta['phoneitem'] = phoneitem yield request location_tab['phone'] = phoneitem location_title = location.xpath("string(.//h3)").extract() location_title[0] = ' '.join(location_title[0].split()) location_address = location.xpath(".//p[@class='address']").extract() #if not location_title: # location_title = location.xpath("h3/a/text()").extract() location_tab['location_title'] = location_title[0].encode('utf-8') location_tab['location_address'] = location_address[0].encode('utf-8') list_location.append(location_tab) location_title = '' location_address = '' location_tab = {} if name : yield { 'name' : name, 'url' : url, 'locations' : list_location } def parse_for_number(self, response): phoneitem = request.meta['phoneitem'] phones = response.xpath("//li[@class='phone']/text()").extract() phone in phones: phoneitem.append(' '.join(phone.split())) yield phoneitem
my spider consists of retrieving list of person (name) in addition addresses , telephone number in each of these addresses (the phone number displayed through ajax request post variables).
the list of phones (phoneitem) remains empty. error?
edit:
how can store result of parse_for_number function in variable, , use after? since call formrequest in loop, can not yield phone variable each execution of function.
resolved !
i use requests library
import requests request = requests.post('http://www.example.com/ajax/my-phone', cookies=cookies, data=data) phones = selector(text=request.content).xpath("//li[@class='phone']/text()").extract()
Comments
Post a Comment