zoukankan      html  css  js  c++  java
  • Python爬虫 —— 知乎之selenium模拟登陆获取cookies+requests.Session()访问+session序列化

    代码如下:

     1 # coding:utf-8
     2 from selenium import webdriver
     3 import requests
     4 import sys
     5 import time
     6 from lxml import etree
     7 import cPickle
     8 import os
     9 # reload(sys)
    10 # sys.setdefaultencoding('utf-8')
    11 
    class Zhihu:
        """Fetch a Zhihu page through a requests session seeded with browser cookies.

        On the first run a Chrome window driven by Selenium performs the login
        and its cookies are copied into a ``requests.Session``. That session is
        pickled to ``SESSION_FILE`` so later runs can skip the browser login.
        """

        # Pickled session location, relative to the current working directory.
        SESSION_FILE = 'session.txt'

        def __init__(self, homeurl):
            """Store *homeurl*, the URL that ``Crawl`` will request."""
            self.homeurl = homeurl

        def save_session(self, session):
            """Pickle *session* to ``SESSION_FILE`` so the next run can reuse it."""
            with open(self.SESSION_FILE, 'wb') as f:
                cPickle.dump(session, f)
                print("Cookies have been writed.")

        def load_session(self):
            """Return the object previously pickled to ``SESSION_FILE``."""
            # NOTE(security): unpickling can execute arbitrary code, so
            # SESSION_FILE must only ever be a file written by save_session().
            with open(self.SESSION_FILE, 'rb') as f:
                return cPickle.load(f)

        def GetCookies(self, username="13060882373", password="xxxxxx"):
            """Log in through Selenium-driven Chrome and return the browser cookies.

            Args:
                username: Text typed into the first input of the sign-in form.
                password: Text typed into the second input of the sign-in form.

            Returns:
                Whatever ``browser.get_cookies()`` yields; ``get_session`` reads
                the ``'name'`` and ``'value'`` entries of each item.
            """
            browser = webdriver.Chrome()
            try:
                browser.get("https://www.zhihu.com/signin")
                browser.find_element_by_xpath(
                    "//main//div[2]/div[1]/form/div[1]/div[2]/div[1]/input"
                ).send_keys(username)
                browser.find_element_by_xpath(
                    "//main//div[2]/div[1]/form/div[2]/div/div[1]/input"
                ).send_keys(password)
                browser.find_element_by_xpath(
                    "//main//div[2]/div[1]/form/button"
                ).click()
                # Fixed pause after submitting the form; presumably it gives the
                # login request time to finish before the cookies are read.
                time.sleep(10)
                return browser.get_cookies()
            finally:
                # Close the browser even when a lookup above raises, so a failed
                # login does not leave a Chrome process running.
                browser.quit()

        def get_session(self):
            """Return a session carrying the login cookies.

            If ``SESSION_FILE`` exists the pickled session is loaded from it.
            Otherwise a new session is built from ``GetCookies()`` and saved.
            """
            if os.path.exists(self.SESSION_FILE):
                return self.load_session()

            s = requests.Session()
            # Start from an empty header set instead of the library defaults.
            s.headers.clear()
            for cookie in self.GetCookies():
                s.cookies.set(cookie['name'], cookie['value'])
            self.save_session(s)
            return s

        def Crawl(self):
            """Fetch ``homeurl`` and print "<author>回答了:<title>" for each answer item."""
            s = self.get_session()
            html = s.get(self.homeurl).text
            html_tree = etree.HTML(html)
            items = html_tree.xpath('//main//div[1]/div[2]//div[@class="ContentItem AnswerItem"]/@data-zop')
            for item in items:
                # WARNING(security): eval() executes attribute text taken from a
                # remote page. It is kept because it yields byte strings that
                # concatenate with the literal below; a safe parser should
                # replace it once the str/unicode handling is reworked.
                content = eval(item)
                authorName = content['authorName']
                title = content['title']
                print(authorName + "回答了:" + title)
    58 
    # Script driver: build a crawler for the Zhihu home page and run it at once.
    zhihu = Zhihu(homeurl='https://www.zhihu.com/')
    zhihu.Crawl()
  • 相关阅读:
    tyvj4751 NOIP春季系列课程 H's Problem (树状数组)
    卡牌分组([AtCoder ARC073]Ball Coloring)
    bzoj1036 [ZJOI2008]树的统计Count (树链剖分+线段树)
    bzoj2287 POJ Challenge 消失之物(背包)
    不能建立引用数组
    CString和string的区别
    防止应用程序重复启动
    public,protected,private
    ATL
    c++头文件中定义全局变量
  • 原文地址:https://www.cnblogs.com/DOLFAMINGO/p/9170429.html
Copyright © 2011-2022 走看看