  • Web crawlers (爬虫)

    A crawler is a program that sends requests to a website, then analyzes the returned resources and extracts the useful data.

    Workflow

      1. Send the request: use an HTTP library to send a Request to the target site; the Request carries the request headers, request body, etc.

      2. Get the response content: if the server responds normally you get a Response, which may contain HTML, JSON, images, video, etc.

      3. Parse the content: parse HTML with regular expressions or third-party libraries such as BeautifulSoup or pyquery; parse JSON with the json module; write binary data to a file in 'b' (binary) mode.

      4. Save the data: to a database or to files (a minimal end-to-end sketch of these four steps follows).
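
    A minimal sketch tying the four steps together (the URL, selector and file name below are placeholders for illustration, not from the original notes):

    import requests
    from bs4 import BeautifulSoup

    # 1. send the request
    res = requests.get('https://example.com')
    # 2. get the response content
    html = res.text
    # 3. parse the content
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find('title').text
    # 4. save the data
    with open('result.txt', 'w', encoding='utf-8') as f:
        f.write(title)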

    Tools

      Request libraries: requests, selenium

      Parsing libraries: regular expressions, beautifulsoup, pyquery

      Storage: files, MySQL, MongoDB, Redis

    Framework: scrapy

    Install: pip3 install requests

    Basic request

    import requests
    res = requests.get('https://www.baidu.com')
    res.encoding = 'utf-8'
    print(res.text)
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

    GET request with parameters -> params

    import requests
    res = requests.get('https://www.baidu.com/s',
                       params={'wd':'图片'},
                       headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                                'Accept-Encoding': 'gzip, deflate, br',
                                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                                'Cache-Control': 'no-cache',
                                'Connection': 'keep-alive',
                                'Cookie': 'BD_UPN=12314753; PSTM=1572350125; BAIDUID=79D0925D8720B930D1F1E5BFF612720F:FG=1; BIDUPSID=AA6E74403EED680B571512C161DCBEA9; BDUSS=EyeXBkQXJNZ1Q0QXk0dzhoTlh1ODFzUzNwa0lySWJwMFBrOVJHMS1SNn5ILTFkRVFBQUFBJCQAAAAAAAAAAAEAAACxNoeFsM3A6GZlbGzIyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL-SxV2~ksVdRE; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=2; BD_HOME=1; H_PS_PSSID=1449_21086_18560_20698_29567_29220_26350; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_645EC=2d24IwpbvK2eVobcmeLgWHGcv8LmvTpWTYgrzRwRetwbEpdCPi08ahOlrNs; COOKIE_SESSION=15438_1_7_5_14_10_0_1_3_5_39_3_72210_0_0_0_1574650244_1574491787_1574665633%7C9%233409_3_1574491763%7C2',
                                'Host': 'www.baidu.com',
                                'Pragma': 'no-cache',
                                'Sec-Fetch-Mode': 'navigate',
                                'Sec-Fetch-Site': 'none',
                                'Sec-Fetch-User': '?1',
                                'Upgrade-Insecure-Requests': '1'
                                })
    res.encoding = 'utf-8'
    print(res.text)

    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

    Request with custom headers -> logging in to 华华手机商城 (aa7a.cn) and reusing the cookies

    import requests
    headers = {'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    # the request the site sends when logging in
    res = requests.post('http://www.aa7a.cn/user.php',
                        headers=headers,
                        data={
                            'username': '2960113637@qq.com',
                            'password':'zrh960906*',
                            'captcha': 'GC3T',
                            'remember': 1,
                            'ref': 'http://www.aa7a.cn/',
                            'act': 'act_login'
                        })
    cookie=res.cookies.get_dict()  # login succeeded, grab the cookies
    res=requests.get('http://www.aa7a.cn/',headers=headers,
                     cookies=cookie,
                     )
    if '2960113637@qq.com' in res.text:
        print("login succeeded")
    else:
        print("not logged in")

    梨视频 (Pear Video) video download

    import requests
    import re
    res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')   # data returned when the page loads more videos
    reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'   # regex for the video links in the HTML page
    obj = re.findall(reg_text,res.text)
    # print(obj)
    for url in obj:
        url = 'https://www.pearvideo.com/'+ url  # build the full URL
        res1 = requests.get(url)
        obj1 = re.findall('srcUrl="(.*?)"',res1.text)
        print(obj1[0])
        name = obj1[0].rsplit('/',1)[1]
        res2 = requests.get(obj1[0], stream=True)  # stream the response so the video is not held in memory all at once
        with open(name, 'wb') as f:
            for line in res2.iter_content(chunk_size=1024):
                f.write(line)

    SSL certificates (HTTPS = HTTP + SSL)

    import requests
    respone=requests.get('https://www.12306.cn',
                         cert=('/path/server.crt',
                               '/path/key'))
    print(respone.status_code)
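
    The example above presents a client-side certificate. For controlling server-certificate verification, requests also takes a verify argument; a small sketch (the URL is the same one, used only as an example):

    import requests
    # skip certificate verification (this raises an InsecureRequestWarning; not for production use)
    respone = requests.get('https://www.12306.cn', verify=False)
    print(respone.status_code)
    # or point verify at a CA bundle instead
    # respone = requests.get('https://www.12306.cn', verify='/path/ca.crt')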

    IP proxies (usually paid): route the request through a proxy, then check the client IP on your own server to confirm it changed

    import requests
    proxies={
        # a dict keeps only the last value for a duplicate key, so list one proxy per scheme
        # 'http':'http://egon:123@localhost:9743',  # proxy with username and password (before the @)
        # 'http':'http://localhost:9743',
        'https':'https://localhost:9743',
        'http':'http://124.205.155.148:9090'
    }
    respone=requests.get('https://www.12306.cn',
                         proxies=proxies)
    
    print(respone.status_code)

    Timeout setting

    import requests
    respone=requests.get('https://www.baidu.com',
                         timeout=0.0001)
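
    With a timeout this small the request will almost certainly fail, so in practice the Timeout exception is caught; a small sketch:

    import requests
    try:
        respone = requests.get('https://www.baidu.com', timeout=0.0001)
    except requests.exceptions.Timeout:
        print('request timed out')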

    Uploading files

    import requests
    files={'file':open('a.jpg','rb')}
    respone=requests.post('http://httpbin.org/post',files=files)
    print(respone.status_code)

    Install: pip3 install lxml (the older easy_install lxml also works)

    Scraping a building-materials shop (shop.jc001.cn)

    import requests
    from bs4 import BeautifulSoup

    for i in range(5):
        response = requests.get('http://shop.jc001.cn/1373528/goods/?p=%s' % i)
        response.encoding = 'gbk'
        # print(response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        # res = soup.prettify()
        # print(res)

        # CSS selector copied from the browser (right click -> Copy -> Copy selector):
        # body > div:nth-child(3) > div > div.col-md-9.col-md-push-3 > div > div.cnt.clearfix.line > ul > li:nth-child(1) > a
        res = soup.select('body > div:nth-child(3) > div > div.col-md-9.col-md-push-3 > div > div.cnt.clearfix.line > ul > li > a')
        # print(res)
        for j, v in enumerate(res):
            print(v.attrs['href'])

    Fetching phone/device info from an internal API and saving it to Excel

    import requests, json
    url = 'http://10.23.255.15/v1/api/provider'
    # request headers
    headers = {"Cookie":"csrftoken=pAJN4t4EcLs9UH0nCzoevqn7dd2HzYIxLKA873Hm1p6EZd7PPAgukvM9UKM9N7qu; sessionid=7kabh663t34qt4he03ittndf48ikjdni", "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
    data = requests.request('get', url, headers=headers)
    data = json.loads(data.text)   # all the data returned by the page
    print(data)
    
    list = []
    list2 = []
    for i in range(len(data)):
        for j in range(len(data[i]['device_details'])):
            # add every phone to the empty list
            list.append(data[i]["device_details"][j]["manufacturer"])
    # remove duplicates
    list1 = set(list)
    for i in list1:
        list2.append(i)
    # store the devices in a dict keyed by manufacturer
    dict = {}
    for i in range(len(list2)):
        dict[list2[i]] = []
    for i in range(len(data)):
        for j in range(len(data[i]['device_details'])):
            dict[data[i]["device_details"][j]["manufacturer"]].append('Android://' + data[i]["device_details"][j]["provider_ip"] + ':5039/' +data[i]["device_details"][j]["serialno"])
    print(dict)
    
    # write the phones into an Excel sheet
    l = 0
    import xlwt
    # create a workbook and set the encoding
    workbook = xlwt.Workbook(encoding = 'utf-8')
    # create a worksheet
    worksheet = workbook.add_sheet('My Worksheet')
    # write to the sheet
    worksheet.write(0, 0, label='品牌')
    worksheet.write(0, 1, label='数量')
    worksheet.write(0, 2, label='设备信息')
    for i in range(len(list2)):
        for j in range(len(dict[list2[i]])):
            worksheet.write(2+j+l, 2, label=dict[list2[i]][j])
        worksheet.write(1+l, 0, label=list2[i])
        worksheet.write(1+l, 1, label=len(dict[list2[i]]))
        worksheet.write(1+l, 2, label=','.join(dict[list2[i]]))
        l = l+len(dict[list2[i]])+1
    workbook.save('test.xls')

    find:  -name="tag name"  match by tag name

       -id, class_="..."  pull out the tag with that attribute value

      -tag.text  get the tag's text content
      -tag.get(attribute_name)  get the value of one of the tag's attributes (a short sketch follows)
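
    A small sketch of these accessors on a made-up snippet (the HTML below is only for illustration):

    from bs4 import BeautifulSoup
    html = '<div id="box" class="item"><a href="/page1">first link</a></div>'
    soup = BeautifulSoup(html, 'lxml')
    div = soup.find(name='div', id='box')   # find by tag name and id
    a = div.find(name='a')                  # find by tag name inside that div
    print(a.text)                           # text content -> first link
    print(a.get('href'))                    # attribute value -> /page1
    print(soup.find(class_='item'))         # find by class (note the trailing underscore)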

    find_all

    import requests
    from bs4 import BeautifulSoup
    url='https://www.autohome.com.cn/news/1/#liststart'
    res=requests.get(url)
    # build a BeautifulSoup (bs4) object
    soup=BeautifulSoup(res.text,'lxml')
    div=soup.find(id='auto-channel-lazyload-article')
    # div is a Tag object
    # print(type(div))

    ul=div.find(name='ul')   # find only the first ul tag
    # ul_list=div.find_all(class_="article")   # find all tags below it with class "article"
    # print(len(ul_list))
    li_list=ul.find_all(name='li')
    # print(len(li_list))
    for li in li_list:
        h3=li.find(name='h3')
        if h3:
            title=h3.text  # take the text out of the h3 tag
            print(title)
        a=li.find(name='a')
        if a:
            article_url=a.get('href')  # take the href attribute out of the a tag
            print(article_url)
    
        img=li.find(name='img')
        if img:
            img_url=img.get('src')
            print(img_url)
        p=li.find(name='p')
        if p:
            content=p.text
            print(content)

    Searching the document tree      five kinds of filters: string, regex, boolean, method, list

    from bs4 import BeautifulSoup
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    
    <p class="title" id="bbaa"><b name="xx" age="18">The Dormouse's story</b><b>xxxx</b></p>
    <p class="xxx" a="xxx">asdfasdf</p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>
    """
    
    soup=BeautifulSoup(html_doc,'lxml')
    # ress=soup.prettify()   # pretty-print the document
    # soup=BeautifulSoup(ress,'lxml')
    # print(ress)

    # traversing the document tree
    # print(soup.p.name)
    # print(soup.p.attrs)
    # print(soup.p.string)
    # print(list(soup.p.strings))
    # print(soup.p.text)

    import re
    print(soup.find_all(name='b'))  # string filter

    print(soup.find_all(name=re.compile('^b')))  # regex filter
    print(soup.find_all(id=re.compile('^b')))

    print(soup.find_all(name=['a','b']))  # list filter

    print(soup.find_all(name=True))  # boolean filter

    def has_class_but_no_id(tag):  # method (function) filter
        return tag.has_attr('class') and not tag.has_attr('id')
    print(soup.find_all(name=has_class_but_no_id))

    CSS selectors and sibling navigation

    sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>",'lxml')
    print(sibling_soup.b.next_sibling)
    print(sibling_soup.c.previous_sibling )
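
    The snippet above is really sibling navigation; for CSS-style selection BeautifulSoup provides select(). A small sketch, reusing the html_doc defined earlier in this section:

    soup = BeautifulSoup(html_doc, 'lxml')
    print(soup.select('p.story > a.sister'))   # child combinator + class selector
    print(soup.select('#link1'))               # select by id
    print(soup.select('a[href="http://example.com/tillie"]'))   # attribute selector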

    Opening Baidu automatically with Selenium

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys  # keyboard key actions
    import time
    bro=webdriver.Chrome()
    bro.get('https://www.baidu.com')
    # grab the search input box
    inp=bro.find_element_by_id('kw')
    # type into the box
    inp.send_keys("图片")
    inp.send_keys(Keys.ENTER)  # press Enter
    time.sleep(3)
    bro.close()

    Automated Baidu login

    from selenium import webdriver
    import time
    bro = webdriver.Chrome()
    bro.get("https://www.baidu.com")
    bro.implicitly_wait(10)
    dl_button=bro.find_element_by_link_text("登录")
    dl_button.click()
    user_login=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    input_name=bro.find_element_by_name('userName')
    input_name.send_keys("2960113637@qq.com")
    input_password=bro.find_element_by_id("TANGRAM__PSP_10__password")
    input_password.send_keys("zrh960906")
    submit_button=bro.find_element_by_id('TANGRAM__PSP_10__submit')
    time.sleep(1)
    submit_button.click()
    time.sleep(10)
    print(bro.get_cookies())
    bro.close()

    JD.com shopping: scraping product listings

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    bro = webdriver.Chrome()
    bro.get("https://www.jd.com")
    bro.implicitly_wait(5)
    
    def get_goods(bro):
        print("------------------------------------")
        goods_li = bro.find_elements_by_class_name('gl-item')
        for good in goods_li:
            img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')
            if not img_url:
                img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
            commit = good.find_element_by_css_selector('.p-commit a').text
            print('''
                    product link: %s
                    product image: %s
                    product name: %s
                    product price: %s
                    number of comments: %s

                    ''' % (url, img_url, name, price, commit))
        next_page = bro.find_element_by_partial_link_text("下一页")
        time.sleep(1)
        next_page.click()
        time.sleep(1)
        get_goods(bro)
    
    input_search=bro.find_element_by_id('key')
    input_search.send_keys("大裤衩")
    input_search.send_keys(Keys.ENTER)
    
    try:
        get_goods(bro)
    except Exception as e:
        print('finished')
    finally:
        bro.close()

    Selenium reference notes (attributes, navigation, cookies, JS, tabs, action chains)

    # get an attribute:
    # tag.get_attribute('src')
    # get the text content
    # tag.text
    # get the tag's id, location, name and size (for reference)
    # print(tag.id)
    # print(tag.location)
    # print(tag.tag_name)
    # print(tag.size)

    # simulate the browser's back and forward buttons
    # browser.back()
    # time.sleep(10)
    # browser.forward()

    # cookie management
    # print(browser.get_cookies())  # get the cookies
    # browser.add_cookie({'k1':'xxx','k2':'yyy'})  # set a cookie
    # print(browser.get_cookies())
    
    # run JavaScript
    # from selenium import webdriver
    # import time
    #
    # bro=webdriver.Chrome()
    # bro.get("http://www.baidu.com")
    # bro.execute_script('alert("hello world")') # pop up an alert
    # time.sleep(5)
    # tab management
    # import time
    # from selenium import webdriver
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.execute_script('window.open()')
    #
    # print(browser.window_handles) # get all tab handles
    # browser.switch_to_window(browser.window_handles[1])
    # browser.get('https://www.taobao.com')
    # time.sleep(3)
    # browser.switch_to_window(browser.window_handles[0])
    # browser.get('https://www.sina.com.cn')
    # browser.close()
    
    # action chains
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    #
    # from selenium.webdriver.support.wait import WebDriverWait  # wait for certain elements on the page to load
    # import time
    #
    # driver = webdriver.Chrome()
    # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # wait=WebDriverWait(driver,3)
    # # driver.implicitly_wait(3)  # use an implicit wait instead
    #
    # try:
    #     driver.switch_to.frame('iframeResult') # switch into the iframeResult frame
    #     sourse=driver.find_element_by_id('draggable')
    #     target=driver.find_element_by_id('droppable')
    #
    #
    # # Approach 1: queue the actions on a single action chain and run them serially
    # # actions=ActionChains(driver) # get an action-chain object
    # # actions.drag_and_drop(sourse,target) # add the action to the chain, ready to run
    # # actions.perform()
    #
    # # Approach 2: separate action chains, moving by a small offset each time
    #
    #
    #     ActionChains(driver).click_and_hold(sourse).perform()
    #     distance=target.location['x']-sourse.location['x']
    #
    #
    #     track=0
    #     while track < distance:
    #         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
    #         track+=2
    #
    #     ActionChains(driver).release().perform()
    #
    #     time.sleep(10)
    #
    #
    # finally:
    #     driver.close()

    Getting cookies with Selenium and reusing them with requests

    # import time
    # from selenium import webdriver
    # import json
    # browser = webdriver.Chrome()
    # browser.get('https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F')
    #
    # time.sleep(50)
    # cookie=browser.get_cookies()
    # print(cookie)
    # with open('cookie.json','w')as f:
    #     json.dump(cookie,f)
    
    
    
    import requests
    import json
    with open('cookie.json','r')as f:
        di=json.load(f)
    
    cookies = {}
    # get the name and value out of each cookie and convert them into the name -> value dict that requests expects
    for cookie in di:
        print(cookie)
        cookies[cookie['name']] = cookie['value']
    
    print(cookies)
    res=requests.get('https://i-beta.cnblogs.com/api/user',
                 cookies=cookies)
    print(res.text)

    Working around the captcha: log in manually in Selenium, save the cookies, then reuse them with requests

    import requests
    from selenium import webdriver
    import time
    import json
    url = 'https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F'
    driver = webdriver.Chrome()
    driver.get(url=url)
    time.sleep(50)
    driver.refresh()
    c = driver.get_cookies()
    print(c)
    with open('xxx.txt','w') as f:
        json.dump(c,f)
    
    time.sleep(3)
    with open('xxx.txt', 'r') as f:
        di = json.load(f)
    cookies = {}
    for cookie in di:
        cookies[cookie['name']] = cookie['value']
    print(cookies)
    
    headers = {
        # 'authority': 'www.jd.com',
        # 'method': 'GET',
        # 'path': '/',
        # 'scheme': 'https',
        # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'accept-encoding': 'gzip, deflate, br',
        # 'accept-language': 'zh-CN,zh;q=0.9',
        # 'cache-control': 'max-age=0',
        # 'upgrade-insecure-requests': '1',
        'authority': 'i-beta.cnblogs.com',
        'method': 'GET',
        'path': '/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'if-modified-since': 'Sun, 24 Nov 2019 06:14:53 GMT',
        # 'if-modified-since': 'Sun, 24 Nov 2019 06:14:53 GMT,
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    
    }
    # make the request using those cookies
    response = requests.get(url='https://i-beta.cnblogs.com/api/user', headers=headers, cookies=cookies)
    print('xxx')
    response.encoding = response.apparent_encoding
    print(response.text)
    from urllib.parse import unquote_plus  # percent-decode (URL-encoded -> readable text)
    from urllib.parse import urlencode  # the reverse: dict -> URL-encoded form data
    msg = '''
    "client_id=c3cef7c66a1843f8b3a9e6a1e3160e20&grant_type=password&timestamp=1574838172749&source=com.zhihu.web&signature=d9ca5ecd24ebcfd42360eabd392d860e837005d8&username=%2B8618953675221&password=lqz12345&captcha=&lang=cn&utm_source=&ref_source=other_https%3A%2F%2Fwww.zhihu.com%2Fsignin%3Fnext%3D%252F"
    '''
    print(unquote_plus(msg))
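
    urlencode goes the other way; a tiny sketch with made-up form fields:

    from urllib.parse import urlencode
    print(urlencode({'username': '+8618953675221', 'lang': 'cn'}))
    # -> username=%2B8618953675221&lang=cn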

    Logging in to Zhihu

    from requests_html import HTMLSession     # request + parsing library     pip install requests-html
    import base64                             # base64 encoding/decoding
    from PIL import Image                     # image handling
    import hmac                               # HMAC signing
    from hashlib import sha1                  # hashing
    import time
    from urllib.parse import urlencode        # URL encoding
    import execjs                             # call node.js from python    pip install PyExecJS
    from http import cookiejar
    
    class Spider():
        def __init__(self):
            self.session = HTMLSession()
            self.session.cookies = cookiejar.LWPCookieJar()    # so the cookie jar supports save() and load()
            self.login_page_url = 'https://www.zhihu.com/signin?next=%2F'
            self.login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
            self.captcha_api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
            self.headers = {
                'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
            }
    
            self.captcha =''         # holds the captcha text
            self.signature = ''    # holds the signature
    
        # first request, just to get the base cookies
        def get_base_cookie(self):
            self.session.get(url=self.login_page_url, headers=self.headers)
    
        # handle the captcha
        def deal_captcha(self):
            r = self.session.get(url=self.captcha_api, headers=self.headers)
            r = r.json()
            if r.get('show_captcha'):
                while True:
                    r = self.session.put(url=self.captcha_api, headers=self.headers)
                    img_base64 = r.json().get('img_base64')
                    with open('captcha.png', 'wb') as f:
                        f.write(base64.b64decode(img_base64))
                    captcha_img = Image.open('captcha.png')
                    captcha_img.show()
                    self.captcha = input('Enter the captcha: ')
                    r = self.session.post(url=self.captcha_api, data={'input_text': self.captcha},
                                          headers=self.headers)
                    if r.json().get('success'):
                        break
    
        def get_signature(self):
            # generate the signed value
            a = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=sha1)
            a.update(b'password')
            a.update(b'c3cef7c66a1843f8b3a9e6a1e3160e20')
            a.update(b'com.zhihu.web')
            a.update(str(int(time.time() * 1000)).encode('utf-8'))
            self.signature = a.hexdigest()
    
        def post_login_data(self):
            data = {
                'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
                'grant_type': 'password',
                'timestamp': str(int(time.time() * 1000)),
                'source': 'com.zhihu.web',
                'signature': self.signature,
                'username': '+8618217210664',
                'password': 'zrh960906*',
                'captcha': self.captcha,
                'lang': 'en',
                'utm_source': '',
                'ref_source': 'other_https://www.zhihu.com/signin?next=%2F',
            }
    
            headers = {
                'x-zse-83': '3_2.0',
                'content-type': 'application/x-www-form-urlencoded',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
            }
    
            data = urlencode(data)
            with open('ttt.js', 'rt', encoding='utf-8') as f:
                js = execjs.compile(f.read())
            data = js.call('b', data)
            print(data)
    
            r = self.session.post(url=self.login_api, headers=headers, data=data)
            if r.status_code == 201:
                self.session.cookies.save('mycookie')
                print('login succeeded')
            else:
                print('login failed')
        def login(self):
            self.get_base_cookie()
            self.deal_captcha()
            self.get_signature()
            self.post_login_data()
    
    if __name__ == '__main__':
        zhihu_spider = Spider()
        zhihu_spider.login()

    XPath selection

    doc='''
    <html>
     <head>
      <base href='http://example.com/' />
      <title>Example website</title>
     </head>
     <body>
      <div id='images'>
       <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
       <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
       <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
       <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
       <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
       <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
      </div>
     </body>
    </html>
    '''
    from lxml import etree
    
    html=etree.HTML(doc)
    # html=etree.parse('search.html',etree.HTMLParser())
    # 1 all nodes
    a=html.xpath('//*')    # match every tag
    # 2 a specific node (the result is a list)
    # a=html.xpath('//head')
    # 3 children and descendants
    a=html.xpath('//div/a')
    a=html.xpath('//body/a')  # no results: the a tags are not direct children of body
    a=html.xpath('//body//a')
    # 4 parent node
    # a=html.xpath('//body//a[@href="image1.html"]/..')
    a=html.xpath('//body//a[1]/..')  # positions start at 1
    # the same thing written with an axis
    a=html.xpath('//body//a[1]/parent::*')
    # 5 attribute matching
    a=html.xpath('//body//a[@href="image1.html"]')

    # 6 getting text
    a=html.xpath('//body//a[@href="image1.html"]/text()')
    a=html.xpath('//body//a/text()')

    # 7 getting attributes
    # a=html.xpath('//body//a/@href')
    # # note that positions start at 1 (not 0)
    a=html.xpath('//body//a[2]/@href')
    # 8 multi-valued attributes
    # when an a tag has several classes an exact match no longer works; use contains
    # a=html.xpath('//body//a[@class="li"]')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 9 matching several attributes
    a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
    a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 10 selecting by position
    a=html.xpath('//a[2]/text()')
    a=html.xpath('//a[2]/@href')
    # the last one
    a=html.xpath('//a[last()]/@href')
    # positions smaller than 3
    a=html.xpath('//a[position()<3]/@href')
    # the second to last
    a=html.xpath('//a[last()-1]/@href')
    # 11 selecting with node axes
    # ancestor: ancestor nodes
    # * selects all ancestors
    a=html.xpath('//a/ancestor::*')
    # # only the div ancestors
    a=html.xpath('//a/ancestor::div')
    # attribute: attribute values
    a=html.xpath('//a[1]/attribute::*')
    # child: direct children
    a=html.xpath('//a[1]/child::*')
    # descendant: all descendants
    a=html.xpath('//a[6]/descendant::*')
    # following: every node after the current one
    a=html.xpath('//a[1]/following::*')
    a=html.xpath('//a[1]/following::*[1]/@href')
    # following-sibling: siblings after the current node
    a=html.xpath('//a[1]/following-sibling::*')
    a=html.xpath('//a[1]/following-sibling::a')
    a=html.xpath('//a[1]/following-sibling::*[2]/text()')
    a=html.xpath('//a[1]/following-sibling::*[2]/@href')
    
    print(a)