Python + Selenium + PhantomJS cannot handle an HTTPS URL: `Adding certificate verification is strongly advised`
I am trying to use Python + Selenium + PhantomJS to handle an HTTPS request, but I fail to receive a response, and I am shown the message "Adding certificate verification is strongly advised".
Here is my get_request function:
import os
import random
import re

# Built-in user agents used when no usable user-agent dictionary file exists.
_DEFAULT_USER_AGENTS = [
    "mozilla/4.0 (mozilla/4.0; msie 7.0; windows nt 5.1; fdm; sv1)",
    "mozilla/4.0 (mozilla/4.0; msie 7.0; windows nt 5.1; fdm; sv1; .net clr 3.0.04506.30)",
    "mozilla/4.0 (windows; msie 7.0; windows nt 5.1; sv1; .net clr 2.0.50727)",
    "mozilla/4.0 (windows; u; windows nt 5.0; en-us) applewebkit/532.0 (khtml, gecko) chrome/3.0.195.33 safari/532.0",
    "mozilla/4.0 (windows; u; windows nt 5.1; en-us) applewebkit/525.19 (khtml, gecko) chrome/1.0.154.59 safari/525.19",
    "mozilla/4.0 (compatible; msie 6.0; linux i686 ; en) opera 9.70",
]

# Any tag carrying type=submit (quoted or not) counts as "page has a form".
_SUBMIT_CONTROL_RE = re.compile(r'''(<.*type=('|")?submit('|")?.*>)''', re.I)

# Markers that make the PhantomJS result look like an error page.
# NOTE(review): a bare "404" anywhere in the page also matches; confirm
# that this broad match is intended.
_NOT_FOUND_RE = re.compile(
    r"(页面不存在)|(未找到页面)|(page not found)|(404)", re.I)


def get_random_x_forwarded_for():
    """Return a random dotted-quad string for an X-Forwarded-For header.

    The four octets are drawn without repetition from 1..254, and the draw
    is repeated until the first octet is none of 10, 172 or 192.
    """
    numbers = []
    while not numbers or numbers[0] in (10, 172, 192):
        numbers = random.sample(range(1, 255), 4)
    return ".".join(str(n) for n in numbers)


def get_random_ua():
    """Return a random User-Agent string.

    Candidates are read, one per line, from ``dicts/user-agents.txt`` below
    the module-level ``modulepath`` (the current directory when that name is
    not defined). When the file is missing or holds no non-blank line, the
    built-in list is used instead.
    """
    # The original tested "dicts/user-agent.txt" but opened
    # modulepath + "dicts/user-agents.txt"; test the path that is opened.
    # NOTE(review): assumes "user-agents.txt" is the intended name - confirm.
    base_dir = globals().get("modulepath", "")
    ua_path = os.path.join(base_dir, "dicts", "user-agents.txt")

    candidates = []
    if os.path.exists(ua_path):
        with open(ua_path, "r") as handle:
            candidates = [line.strip() for line in handle if line.strip()]
    if not candidates:
        candidates = _DEFAULT_USER_AGENTS
    return random.choice(candidates)


def _phantomjs_service_args(proxyurl):
    """Build the PhantomJS command-line arguments, including proxy options."""
    args = ['--ignore-ssl-errors=true', '--ssl-protocol=any']
    if not proxyurl:  # "", 0 and None all mean "no proxy"
        return args
    proxy_type, separator, remainder = str(proxyurl).partition("://")
    if not separator:
        # No scheme given: treat the whole value as host:port of an HTTP proxy.
        proxy_type, remainder = "http", proxy_type
    host_port = remainder.split("/")[0]  # tolerate a trailing slash or path
    return args + ['--proxy=%s' % host_port, '--proxy-type=%s' % proxy_type]


def get_request(url, by="mechanicalsoup", proxyurl="", cookie=""):
    """Fetch ``url`` and return a summary of the response.

    Args:
        url: Address to request.
        by: ``"seleniumphantomjs"`` (compared case-insensitively) renders the
            page with PhantomJS through Selenium; any other value uses
            MechanicalSoup.
        proxyurl: Proxy such as ``"http://127.0.0.1:8080"``. Empty or 0 means
            no proxy. Only the PhantomJS path uses it.
        cookie: Raw cookie string to send; empty means none.

    Returns:
        A dict with the keys ``code``, ``title``, ``content``,
        ``hasformaction`` and ``formactionvalue``. On the PhantomJS path
        ``code`` is 200 after a completed load and ``None`` after a timeout.
        On the MechanicalSoup path ``code`` is the HTTP status, or 0 when the
        request failed. ``formactionvalue`` is currently always ``""``.
    """
    code = None
    title = None
    content = None
    hasformaction = False
    # Intended shapes: https://www.baidu.com^a=1&b=2 or
    # https://www.baidu.com/?a=1&b=2 (nothing fills this in yet).
    formactionvalue = ""

    if str(by).lower() == "seleniumphantomjs":
        from selenium import webdriver
        from selenium.common.exceptions import TimeoutException
        from selenium.webdriver.common.desired_capabilities import (
            DesiredCapabilities,
        )

        # Always send a random user agent, as the MechanicalSoup path does;
        # the cookie setting is added only when a cookie was supplied.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = get_random_ua()
        if cookie != "":
            dcap["phantomjs.page.settings.cookie"] = cookie

        driver = None  # stays None when the constructor itself fails
        try:
            driver = webdriver.PhantomJS(
                service_args=_phantomjs_service_args(proxyurl),
                desired_capabilities=dcap)
            print(111111111)
            driver.implicitly_wait(5)
            driver.set_page_load_timeout(5)
            driver.get(url)
            print(222222222222222)
            code = 200
            title = driver.title
            content = driver.page_source
            submit_match = _SUBMIT_CONTROL_RE.search(content)
            if submit_match:
                hasformaction = True
                print(submit_match.group(1))
            print("len content :\n" + str(len(content)))
            print("title :\n" + title)
            if _NOT_FOUND_RE.search(title + content):
                # Retry without the browser, keeping the caller's proxy and
                # cookie instead of silently dropping them.
                return get_request(url, by="mechanicalsoup",
                                   proxyurl=proxyurl, cookie=cookie)
        except TimeoutException as e:
            print(e)
        finally:
            if driver is not None:
                driver.quit()
        return {
            'code': code,
            'title': title,
            'content': content,
            'hasformaction': hasformaction,
            'formactionvalue': formactionvalue}

    import mechanicalsoup
    from bs4 import BeautifulSoup

    try:
        browser = mechanicalsoup.Browser(soup_config={"features": "lxml"})
        browser.session.headers.update({'user-agent': '%s' % get_random_ua()})
        if cookie != "":
            browser.session.headers.update({'cookie': '%s' % cookie})
        browser.session.headers.update(
            {'x-forwarded-for': '%s' % get_random_x_forwarded_for()})
        # verify=False skips certificate checks; urllib3 then emits an
        # InsecureRequestWarning, which is a warning and not a failure.
        result = browser.get(url, timeout=10, verify=False)
        code = result.status_code
        raw = result.content
        import chardet
        # chardet may report no encoding at all; fall back to UTF-8 rather
        # than failing the whole request on decode(None).
        bytesencoding = chardet.detect(raw)['encoding'] or "utf-8"
        content = raw.decode(bytesencoding, errors="replace")
        title_tag = BeautifulSoup(content, "lxml").title
        title_value = title_tag.string if title_tag is not None else None
        hasformaction = bool(_SUBMIT_CONTROL_RE.search(content))
    except Exception as exc:
        # Best effort: report the failure and return placeholder values.
        print(exc)
        code = 0
        title_value = "you may blocked or code doesn't handle ssl certificate well"
        content = 'can not html content time,may blocked server request'
        hasformaction = False
    return {
        'code': code,
        'title': title_value,
        'content': content,
        'hasformaction': hasformaction,
        'formactionvalue': formactionvalue}


if __name__ == "__main__":
    # Guarded so that importing this module does not trigger a network call.
    b = get_request("https://www.zoomeye.org/search/advanced", by="seleniumphantomjs")
    # b = get_request("https://www.zoomeye.org/search/advanced", by="mechanicalsoup")
    print(b)
When I use b=get_request("https://www.zoomeye.org/search/advanced",by="seleniumphantomjs"),
it prints 11111..1 and then gets stuck there without printing 2222.2. Then I tried b=get_request("https://www.zoomeye.org/search/advanced",by="mechanicalsoup");
this time I get the debug output below:
/usr/local/lib/python3.6/site-packages/requests/packages/urllib3/connectionpool.py:852: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
I want to use Python 3 + Selenium + PhantomJS and deal with this "Adding certificate verification is strongly advised"
message. Can anyone help me?
Later, I tried to use PhantomJS directly, without Selenium:
phantomjs --ignore-ssl-errors=true --ssl-protocol=any --web-security=false 1.js
1.js has the content below:
// PhantomJS script: load one page and print its HTML.
// Exits with status 1 when the page cannot be loaded, so a failed HTTPS
// handshake is reported instead of printing an empty document.
var webpage = require('webpage');
var page = webpage.create();
console.log('test666');

// Report every resource that fails to load (SSL errors surface here).
page.onResourceError = function (resourceError) {
    console.log('resource error: ' + resourceError.url +
        ' (' + resourceError.errorCode + ') ' + resourceError.errorString);
};

page.open('https://www.zoomeye.org/search/advanced', function (status) {
    // status is 'success' or 'fail'; the original ignored it.
    if (status !== 'success') {
        console.log('failed to load page, status: ' + status);
        phantom.exit(1);
        return;
    }
    console.log('content: ' + page.content);
    phantom.exit();
});
However, it doesn't work either; maybe it is a PhantomJS bug :(
Go through the URL below. It lists the command-line options you need in order to use HTTPS with PhantomJS:
http://phantomjs.org/api/command-line.html
The main options required are below:
- --web-security=[true|false] enables web security and forbids cross-domain XHR (default is true). Also accepted: [yes|no].
- --ssl-protocol=[sslv3|sslv2|tlsv1|tlsv1.1|tlsv1.2|any] sets the SSL protocol for secure connections (default is SSLv3). Not all values may be supported, depending on the system OpenSSL library.
- --ignore-ssl-errors=[true|false] ignores SSL errors, such as expired or self-signed certificate errors (default is false). Also accepted: [yes|no].
I have Java code that works fine for me:
/**
 * Builds the capabilities used to start PhantomJS so that it accepts HTTPS
 * sites: web security is disabled, any SSL/TLS protocol is allowed and
 * certificate errors are ignored.
 *
 * @param os not used by this method
 * @return capabilities carrying the PhantomJS and GhostDriver command-line
 *         arguments, with screenshots and JavaScript enabled
 */
private static Capabilities getphantomcapabilities(String os) {
    // Command-line switches handed to the phantomjs executable.
    ArrayList<String> cliArgs = new ArrayList<String>();
    cliArgs.add("--web-security=false");
    cliArgs.add("--ssl-protocol=any");
    cliArgs.add("--ignore-ssl-errors=true");

    DesiredCapabilities capabilities = DesiredCapabilities.phantomjs();
    capabilities.setCapability("takesScreenshot", true);
    capabilities.setJavascriptEnabled(true);
    capabilities.setCapability(
            PhantomJSDriverService.PHANTOMJS_CLI_ARGS, cliArgs);
    // NOTE(review): the argument text is kept exactly as posted; confirm the
    // option spelling against the GhostDriver documentation.
    capabilities.setCapability(
            PhantomJSDriverService.PHANTOMJS_GHOSTDRIVER_CLI_ARGS,
            new String[] { "--loglevel=2" });
    return capabilities;
}
Convert this code to Python. It should work.
Hope this helps :)
Comments
Post a Comment