Updated: A Detailed Guide to Simulating Zhihu Login, Covering the English Captcha and the Chinese Inverted-Character Captcha (Follow-up to the Scrapy Zhihu Analysis, Part 3)


For the Chinese (inverted-character) captcha, first locate the captcha image element on the page and correct for the height of the browser chrome above the viewport:

# Locate the inverted-character captcha image
ele_location = chinese_captcha_element.location
x_relative = ele_location["x"]
y_relative = ele_location["y"]
# Subtract the height of the browser's top toolbar so the Y coordinate stays accurate
browser_navigation_panel_height = browser.execute_script('return window.outerHeight - window.innerHeight;')

Then click the inverted characters at the positions zheye reports:

# With two inverted characters:
# Coordinates of the first inverted character; divide by 2 because the captcha rendered on Zhihu
# is half the size of the locally saved image
first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
second_position = [int(last_position[1][0] / 2), int(last_position[1][1] / 2)]
# Move the mouse and click
move(x_relative + first_position[0], y_relative + browser_navigation_panel_height + first_position[1])
click()
# Wait 3s, then click the second one
time.sleep(3)
move(x_relative + second_position[0], y_relative + browser_navigation_panel_height + second_position[1])
click()

# With a single inverted character:
first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
move(x_relative + first_position[0], y_relative + browser_navigation_panel_height + first_position[1])
click()
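The move()/click() calls from the mouse package work in absolute screen coordinates, which is why the toolbar height has to be added back onto the Y value. If that correction proves fragile, Selenium's own ActionChains can click at an offset relative to the captcha element instead, so no screen-coordinate math is needed. A minimal sketch under that assumption, reusing chinese_captcha_element and first_position from above (in Selenium 3 the offset is measured from the element's top-left corner):

from selenium.webdriver.common.action_chains import ActionChains

# Click the first inverted character at an offset inside the captcha element itself,
# avoiding the translation to absolute screen coordinates.
ActionChains(browser).move_to_element_with_offset(
    chinese_captcha_element, first_position[0], first_position[1]
).click().perform()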
After the inverted characters have been clicked, just click the Login button; with that, the login finally succeeds.
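To confirm that the login really went through, the full spider below simply polls for the notifications icon that only exists once the user is logged in. A slightly more idiomatic variant, sketched here with an explicit wait (the CSS selector is the one used in the spider; the 30-second timeout is an assumption):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 30 seconds until the logged-in-only notifications icon appears.
WebDriverWait(browser, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.Popover.PushNotifications.AppHeader-notifications'))
)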
The complete Zhihu spider source code is attached below; the other article in this series only covers the analysis of Zhihu.
A few parameters need to be replaced with your own values.
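Concretely, that means the 'num'/'passwd' account placeholders, the chromedriver path, and the yundama appid/appkey that appear in the code below. A purely illustrative sketch of collecting them in one place (these constant names are not part of the original spider):

import os

# Illustrative names only -- the original spider hard-codes these values inline.
ZHIHU_USER = os.environ.get("ZHIHU_USER", "num")             # Zhihu account ('num' in the code below)
ZHIHU_PASSWORD = os.environ.get("ZHIHU_PASSWORD", "passwd")  # Zhihu password ('passwd' below)
CHROMEDRIVER_PATH = "D:/Envs/py3scrapy/Scripts/chromedriver.exe"
YDM_USER, YDM_PASSWORD = "user", "passwd"                    # yundama account
YDM_APPID, YDM_APPKEY = 11111, "appkey"                      # yundama app credentials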
# -*- coding: utf-8 -*-
import json
import re
import time
import datetime

import scrapy
from scrapy.loader import ItemLoader
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from urllib import parse
from zheye import zheye
from mouse import move, click

from douban.items import ZhihuAnswerItem, ZhihuQuestionItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/']
    start_answer_url = "https://www.zhihu.com/api/v4/questions/{0}/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_labeled%2Cis_recognized%2Cpaid_info%2Cpaid_info_content%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B*%5D.topics&offset={1}&limit={2}&sort_by=default&platform=desktop"
    headers = {
        # "HOST": "www.zhihu.com",
        # "Referer": "https://www.zhizhu.com",
        # The User-Agent header is essential
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
    }

    def parse(self, response):
        # Use XPath to pull the href of every <a> tag on the page
        all_urls = response.xpath("//a/@href").extract()
        # The hrefs may be relative, so join them with the response URL, then iterate over the list
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        for url in all_urls:
            # print(url)
            match_obj = re.match(r"(.*zhihu.com/question/(\d+)).*", url)
            if match_obj:
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
                # break
            else:
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)
                # print(request_url, question_id)

    def parse_question(self, response):
        match_obj = re.match(r"(.*zhihu.com/question/(\d+)).*", response.url)
        if match_obj:
            question_id = int(match_obj.group(2))
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        print(response)
        item_loader.add_css("title", 'h1.QuestionHeader-title::text')
        item_loader.add_css('content', '.QuestionHeader-detail')
        item_loader.add_value('url', response.url)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
        item_loader.add_css('topics', '.QuestionHeader-topics .Popover div::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-itemValue::text')
        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 0, 20),
                             headers=self.headers, callback=self.parse_answer)
        yield question_item

    def parse_answer(self, response):
        ans_json = json.loads(response.text)
        is_end = ans_json["paging"]["is_end"]
        next_url = ans_json["paging"]["next"]
        totals_num = ans_json["paging"]["totals"]
        # Extract the individual answer fields
        for answer in ans_json["data"]:
            answer_item = ZhihuAnswerItem()
            answer_item["zhihu_id"] = answer["id"]
            answer_item["url"] = answer["url"]
            answer_item["question_id"] = answer["question"]["id"]
            answer_item["author_id"] = answer["author"]["id"] if "id" in answer["author"] else None
            answer_item["content"] = answer["content"] if "content" in answer else None
            answer_item["praise_num"] = answer["voteup_count"]
            answer_item["comments_num"] = answer["comment_count"]
            answer_item["create_time"] = answer["created_time"]
            answer_item["update_time"] = answer["updated_time"]
            answer_item["crawl_time"] = datetime.datetime.now()
            yield answer_item
        if not is_end:
            yield scrapy.Request(next_url, headers=self.headers, callback=self.parse_answer)

    def start_requests(self):
        # cookies = pickle.load(open("D:/Pythonstudy/douban/douban/spiders/cookies/zhihu_cookies", "rb"))
        # cookie_dict = {}
        # for cookie in cookies:
        #     cookie_dict[cookie['name']] = cookie['value']
        # return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

        # 1. Start Chrome from cmd with a remote-debugging port (close every Chrome instance first,
        #    otherwise attaching will fail). Zhihu detects Selenium/WebDriver with one line of JS
        #    (window.navigator.webdriver); attaching to an already-running Chrome is the workaround
        #    used here. Captcha handling follows below.
        chrome_option = webdriver.ChromeOptions()
        chrome_option.add_argument('--disable-extensions')
        chrome_option.add_experimental_option('debuggerAddress', "127.0.0.1:9222")
        # browser = webdriver.Chrome()
        browser = webdriver.Chrome(executable_path="D:/Envs/py3scrapy/Scripts/chromedriver.exe",
                                   chrome_options=chrome_option)
        browser.get("https://www.zhihu.com/")
        # Maximize the window so the coordinates are accurate; if it is already maximized, just pass
        try:
            browser.maximize_window()
        except:
            pass
        # time.sleep(30)
        # cookies = browser.get_cookies()
        # pickle.dump(cookies, open("D:/Pythonstudy/douban/douban/spiders/cookies/zhihu_cookies", 'wb'))
        # cookie_dict = {}
        # for cookie in cookies:
        #     cookie_dict[cookie['name']] = cookie['value']
        # return [scrapy.Request(url=self.start_urls[0], dont_filter=True, cookies=cookie_dict)]

        # Switch to password login and perform the login; adjust the delays as you like
        change_login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div[1]/div/form/div[1]/div[2]')
        change_login.click()
        time.sleep(1)
        # Ctrl+A selects any existing text so the new input overwrites it instead of being appended,
        # which would corrupt the account and password
        browser.find_element_by_name("username").send_keys(Keys.CONTROL + "a")
        browser.find_element_by_name("username").send_keys('num')
        time.sleep(1)
        browser.find_element_by_name("password").send_keys(Keys.CONTROL + "a")
        browser.find_element_by_name("password").send_keys('passwd')
        time.sleep(1)
        login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div[1]/div/form/button')
        login.click()
        # If the login succeeds straight away, give the redirected page 10s to load
        time.sleep(10)

        login_success = False
        while not login_success:
            # The notifications icon only exists after a successful login;
            # the compound class name needs a CSS selector (find_element_by_class_name cannot take spaces)
            try:
                notify_ele = browser.find_element_by_css_selector('.Popover.PushNotifications.AppHeader-notifications')
                login_success = True
            except:
                pass
            # Determine whether the captcha is the English one or the inverted-character one
            try:
                english_captcha_element = browser.find_element_by_class_name('Captcha-englishImg')
            except:
                english_captcha_element = None
            try:
                chinese_captcha_element = browser.find_element_by_class_name('Captcha-chineseImg')
            except:
                chinese_captcha_element = None

            # Chinese (inverted-character) captcha
            if chinese_captcha_element:
                # Locate the inverted-character captcha image
                ele_location = chinese_captcha_element.location
                x_relative = ele_location["x"]
                y_relative = ele_location["y"]
                # Subtract the height of the browser's top toolbar so the Y coordinate stays accurate
                browser_navigation_panel_height = browser.execute_script('return window.outerHeight - window.innerHeight;')
                # Take the captcha's src, base64-decode it and save it locally
                base64_text = chinese_captcha_element.get_attribute("src")
                import base64
                # Oddly, the data URL contains "%0A" sequences that must be stripped before decoding
                code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
                f = open("yzm_cn.jpeg", "wb")
                f.write(base64.b64decode(code))
                f.close()
                # Use zheye (github: zheye) to find the coordinates of the inverted characters
                z = zheye()
                # Path of the saved captcha image
                positions = z.Recognize('yzm_cn.jpeg')
                last_position = []
                # Only two cases: one inverted character or two
                # Two inverted characters:
                if len(positions) == 2:
                    # Reorder the coordinates (zheye returns (row, col)); the raw order is not usable as-is
                    if positions[0][1] > positions[1][1]:
                        last_position.append([positions[1][1], positions[1][0]])
                        last_position.append([positions[0][1], positions[0][0]])
                    else:
                        last_position.append([positions[0][1], positions[0][0]])
                        last_position.append([positions[1][1], positions[1][0]])
                    # Divide by 2 because the captcha rendered on Zhihu is half the size of the saved image
                    first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    second_position = [int(last_position[1][0] / 2), int(last_position[1][1] / 2)]
                    # Move the mouse and click
                    move(x_relative + first_position[0],
                         y_relative + browser_navigation_panel_height + first_position[1])
                    click()
                    # Wait 3s, then click the second one
                    time.sleep(3)
                    move(x_relative + second_position[0],
                         y_relative + browser_navigation_panel_height + second_position[1])
                    click()
                # Only one inverted character:
                else:
                    last_position.append([positions[0][1], positions[0][0]])
                    first_position = [int(last_position[0][0] / 2), int(last_position[0][1] / 2)]
                    move(x_relative + first_position[0],
                         y_relative + browser_navigation_panel_height + first_position[1])
                    click()
                # After clicking the inverted characters, re-enter the account and password
                browser.find_element_by_name("username").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_name("username").send_keys('num')
                time.sleep(1)
                browser.find_element_by_name("password").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_name("password").send_keys('passwd')
                time.sleep(1)
                login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div[1]/div/form/button')
                login.click()

            # English captcha
            if english_captcha_element:
                # Decode and save the image the same way as for the Chinese captcha
                base64_text = english_captcha_element.get_attribute("src")
                import base64
                code = base64_text.replace("data:image/jpg;base64,", "").replace("%0A", "")
                f = open("yzm_en.jpeg", "wb")
                f.write(base64.b64decode(code))
                f.close()
                # Use the yundama online captcha-solving API
                from douban.tools.yundama_requests import YDMHttp
                # yundama username, password, appID, appKey -- fill in your own values
                appid = 11111
                yundama = YDMHttp("user", "passwd", appid, 'appkey')
                # Image path and decode parameters (captcha type, timeout)
                code = yundama.decode("yzm_en.jpeg", 5000, 60)
                # Retry until a non-empty result comes back
                while True:
                    if code == "":
                        code = yundama.decode("yzm_en.jpeg", 5000, 60)
                    else:
                        break
                # Type the recognized captcha into the input box
                browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div[1]/div/form/div[4]/div/div/label/input').send_keys(code)
                # Re-enter the account and password
                browser.find_element_by_name("username").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_name("username").send_keys('num')
                time.sleep(1)
                browser.find_element_by_name("password").send_keys(Keys.CONTROL + "a")
                browser.find_element_by_name("password").send_keys('passwd')
                time.sleep(1)
                login = browser.find_element_by_xpath('//*[@id="root"]/div/main/div/div/div[1]/div/form/button')
                login.click()
            pass
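One last note: the debuggerAddress option in start_requests attaches to a Chrome instance that must already be running with remote debugging enabled, as the comment there says. A hedged sketch of launching such an instance from Python instead of typing the command into cmd (the Chrome path and profile directory are machine-specific assumptions):

import subprocess

# Start Chrome with the remote-debugging port the spider attaches to (127.0.0.1:9222).
# Both paths below are assumptions; adjust them for your machine.
chrome_path = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"
subprocess.Popen([
    chrome_path,
    "--remote-debugging-port=9222",
    r"--user-data-dir=D:\selenium\ChromeProfile",  # a separate profile keeps this instance independent
])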