  • Web crawlers

    A crawler is a program that sends requests to a website, fetches the resources, then parses them and extracts the useful data.

    Workflow

      1. Send the request: use an HTTP library to send a Request to the target site; the Request carries the request headers, request body, and so on.

      2. Get the response: if the server responds normally you get a Response, whose body may be HTML, JSON, an image, a video, etc.

      3. Parse the content: HTML is parsed with regular expressions or third-party libraries such as BeautifulSoup or pyquery; JSON with the json module; binary data is written to a file in 'b' mode.

      4. Store the data: in a database or in files. (A minimal end-to-end sketch follows this list.)
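
    A minimal end-to-end sketch of the four steps (httpbin.org is a public echo service; the URL and field names are only for illustration):

    import requests
    import json

    # 1. send the request
    resp = requests.get('https://httpbin.org/get', params={'q': 'demo'})
    # 2. get the response content
    print(resp.status_code)
    # 3. parse the content (this endpoint returns JSON)
    data = resp.json()
    # 4. store the data in a file
    with open('result.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)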

    Tools

      Request libraries: requests, selenium

      Parsing libraries: regular expressions, BeautifulSoup, pyquery

      Storage: files, MySQL, MongoDB, Redis

    Framework: scrapy

    Install: pip3 install requests

    Basic request

    import requests
    res = requests.get('https://www.baidu.com')
    res.encoding = 'utf-8'              # decode the body as utf-8
    print(res.text)
    with open('a.html', 'w', encoding='utf-8') as f:   # save the page locally
        f.write(res.text)
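
    Besides res.text, the Response object has a few attributes that later examples rely on; a quick reference:

    print(res.status_code)    # HTTP status code, e.g. 200
    print(res.headers)        # response headers (dict-like)
    print(res.cookies)        # cookies set by the server
    print(res.content)        # raw body as bytes (res.text is the decoded str)
    # print(res.json())       # parse a JSON body (would raise here, since Baidu returns HTML)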

    GET request with parameters -> params

    import requests
    res = requests.get('https://www.baidu.com/s',
                       params={'wd':'图片'},
                       headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                                'Accept-Encoding': 'gzip, deflate, br',
                                'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                                'Cache-Control': 'no-cache',
                                'Connection': 'keep-alive',
                                'Cookie': 'BD_UPN=12314753; PSTM=1572350125; BAIDUID=79D0925D8720B930D1F1E5BFF612720F:FG=1; BIDUPSID=AA6E74403EED680B571512C161DCBEA9; BDUSS=EyeXBkQXJNZ1Q0QXk0dzhoTlh1ODFzUzNwa0lySWJwMFBrOVJHMS1SNn5ILTFkRVFBQUFBJCQAAAAAAAAAAAEAAACxNoeFsM3A6GZlbGzIyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL-SxV2~ksVdRE; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=2; BD_HOME=1; H_PS_PSSID=1449_21086_18560_20698_29567_29220_26350; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_645EC=2d24IwpbvK2eVobcmeLgWHGcv8LmvTpWTYgrzRwRetwbEpdCPi08ahOlrNs; COOKIE_SESSION=15438_1_7_5_14_10_0_1_3_5_39_3_72210_0_0_0_1574650244_1574491787_1574665633%7C9%233409_3_1574491763%7C2',
                                'Host': 'www.baidu.com',
                                'Pragma': 'no-cache',
                                'Sec-Fetch-Mode': 'navigate',
                                'Sec-Fetch-Site': 'none',
                                'Sec-Fetch-User': '?1',
                                'Upgrade-Insecure-Requests': '1'
                                })
    res.encoding = res.apparent_encoding   # let requests detect the charset ('gbk' would garble this utf-8 page)
    print(res.text)

    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

    Request with headers and cookies -> headers      Huahua phone mall (aa7a.cn)

    import requests
    headers = {'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    # the request sent when logging in
    res = requests.post('http://www.aa7a.cn/user.php',
                        headers=headers,
                        data={
                            'username': '2960113637@qq.com',
                            'password':'zrh960906*',
                            'captcha': 'GC3T',
                            'remember': 1,
                            'ref': 'http://www.aa7a.cn/',
                            'act': 'act_login'
                        })
    cookie=res.cookies.get_dict()  # login succeeded; grab the cookies
    res=requests.get('http://www.aa7a.cn/',headers=headers,
                     cookies=cookie,
                     )
    if '2960113637@qq.com' in res.text:
        print("Logged in")
    else:
        print("Not logged in")

    Pear Video (pearvideo.com)

    import requests
    import re
    res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')   # the ajax request triggered when the page refreshes
    reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'   # regex over the returned HTML
    obj = re.findall(reg_text,res.text)
    # print(obj)
    for url in obj:
        url = 'https://www.pearvideo.com/'+ url  # build the full video-page URL
        res1 = requests.get(url)
        obj1 = re.findall('srcUrl="(.*?)"',res1.text)
        print(obj1[0])
        name = obj1[0].rsplit('/',1)[1]
        res2 = requests.get(obj1[0])
        with open (name,'wb') as f:
            for line in res2.iter_content():
                f.write(line)
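
    Without stream=True, iter_content() still downloads the whole body into memory before iterating; a variant of the last few lines that streams the video in chunks:

    res2 = requests.get(obj1[0], stream=True)     # defer downloading the body
    with open(name, 'wb') as f:
        for chunk in res2.iter_content(chunk_size=1024 * 64):   # 64 KB chunks
            if chunk:
                f.write(chunk)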

     SSL: https = http + SSL (client certificates)

    import requests
    respone=requests.get('https://www.12306.cn',
                         cert=('/path/server.crt',
                               '/path/key'))
    print(respone.status_code)
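
    If the goal is just to skip certificate verification rather than send a client certificate, requests also accepts verify=False (urllib3 can silence the resulting warning):

    import requests
    import urllib3

    urllib3.disable_warnings()    # suppress the InsecureRequestWarning
    respone = requests.get('https://www.12306.cn', verify=False)
    print(respone.status_code)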

    IP proxies (usually paid). To verify the proxy works, access your own service through it and read the client IP on the server side.

    import requests
    proxies={
        # note: a dict keeps only one value per key, so enable ONE entry per scheme
        'http':'http://egon:123@localhost:9743',   # proxy with auth; user:password before the @
        # 'http':'http://localhost:9743',          # plain proxy, no auth
        'https':'https://localhost:9743',
        # 'http':'http://124.205.155.148:9090',
    }
    respone=requests.get('https://www.12306.cn',
                         proxies=proxies)
    
    print(respone.status_code)
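
    To see which exit IP the proxy actually gives you (the "check the client IP on the server side" idea above), httpbin.org/ip echoes the caller's IP; a small sketch, assuming the proxy address is reachable:

    import requests

    proxies = {'http': 'http://124.205.155.148:9090'}   # example proxy from above; may well be offline
    res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(res.json())    # {'origin': '<exit IP as seen by the server>'}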

    Timeout setting

    import requests
    respone=requests.get('https://www.baidu.com',
                         timeout=0.0001)   # deliberately tiny, so this will raise a Timeout
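
    With a timeout this small the request will almost certainly fail; the exception can be caught explicitly:

    import requests

    try:
        respone = requests.get('https://www.baidu.com', timeout=0.0001)
    except requests.exceptions.Timeout as e:
        print('request timed out:', e)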

    File upload

    import requests
    files={'file':open('a.jpg','rb')}
    respone=requests.post('http://httpbin.org/post',files=files)
    print(respone.status_code)
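
    files can also take a tuple to control the filename and content type sent to the server; a small sketch against the same httpbin endpoint:

    import requests

    files = {'file': ('a.jpg', open('a.jpg', 'rb'), 'image/jpeg')}   # (filename, file object, content type)
    respone = requests.post('http://httpbin.org/post', files=files)
    print(respone.json()['files'].keys())   # httpbin echoes the uploaded fields back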

    Install the lxml parser: pip3 install lxml

    Scraping a building-materials shop (jc001.cn)

    import requests
    from bs4 import BeautifulSoup

    for i in range(5):
        response = requests.get('http://shop.jc001.cn/1373528/goods/?p=%s' % (i))
        response.encoding = 'gbk'
        # print(response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        # res = soup.prettify()
        # print(res)

        # CSS selector copied from the browser ("Copy selector"), with the li index
        # removed so it matches every product link on the page rather than only the first
        res = soup.select('body > div:nth-child(3) > div > div.col-md-9.col-md-push-3 > div > div.cnt.clearfix.line > ul > li > a')
        # print(res)
        for idx, v in enumerate(res):
            print(v.attrs['href'])

    Fetching the device list from an internal API

    import requests, json
    url = 'http://10.23.255.15/v1/api/provider'
    # request headers
    headers = {"Cookie":"csrftoken=pAJN4t4EcLs9UH0nCzoevqn7dd2HzYIxLKA873Hm1p6EZd7PPAgukvM9UKM9N7qu; sessionid=7kabh663t34qt4he03ittndf48ikjdni", "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
    data = requests.request('get', url, headers=headers)
    data = json.loads(data.text)   # parse the JSON body into Python objects
    print(data)
    
    list = []
    list2 = []
    for i in range(len(data)):
        for j in range(len(data[i]['device_details'])):
            # collect every device's manufacturer into the list
            list.append(data[i]["device_details"][j]["manufacturer"])
    # de-duplicate
    list1 = set(list)
    for i in list1:
        list2.append(i)
    # group the devices by manufacturer in a dict
    dict = {}
    for i in range(len(list2)):
        dict[list2[i]] = []
    for i in range(len(data)):
        for j in range(len(data[i]['device_details'])):
            dict[data[i]["device_details"][j]["manufacturer"]].append('Android://' + data[i]["device_details"][j]["provider_ip"] + ':5039/' +data[i]["device_details"][j]["serialno"])
    print(dict)
    
    # write the devices into an Excel sheet
    l = 0
    import xlwt
    # create a workbook and set the encoding
    workbook = xlwt.Workbook(encoding = 'utf-8')
    # create a worksheet
    worksheet = workbook.add_sheet('My Worksheet')
    # write the header row
    worksheet.write(0, 0, label='Brand')
    worksheet.write(0, 1, label='Count')
    worksheet.write(0, 2, label='Device info')
    for i in range(len(list2)):
        for j in range(len(dict[list2[i]])):
            worksheet.write(2+j+l, 2, label=dict[list2[i]][j])
        worksheet.write(1+l, 0, label=list2[i])
        worksheet.write(1+l, 1, label=len(dict[list2[i]]))
        worksheet.write(1+l, 2, label=','.join(dict[list2[i]]))
        l = l+len(dict[list2[i]])+1
    workbook.save('test.xls')

    find:  -name="tag name"

       -id, class_, ="..."  fetch the tag matching that attribute

      -tag.text       get the tag's text content
      -tag.get(attr)  get the value of one of the tag's attributes  (see the short example below)
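
    A short illustration of these calls on a throwaway snippet (the HTML here is made up for the example):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<div id="box"><a class="link" href="/x">hello</a></div>', 'lxml')
    a = soup.find(name='a', class_='link')    # find by tag name and class
    print(a.text)                             # 'hello'  -> tag.text
    print(a.get('href'))                      # '/x'     -> tag.get(attr)
    print(soup.find(id='box').name)           # 'div'    -> find by id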

    find_all

    import requests
    from bs4 import BeautifulSoup
    url='https://www.autohome.com.cn/news/1/#liststart'
    res=requests.get(url)
    # build a BeautifulSoup (bs4) object
    soup=BeautifulSoup(res.text,'lxml')
    div=soup.find(id='auto-channel-lazyload-article')
    # div is a Tag object
    # print(type(div))
    
    ul=div.find(name='ul')   # find() returns only the first ul tag
    # ul_list=div.find_all(class_="article")   # find_all() returns every tag whose class is "article"
    # print(len(ul_list))
    li_list=ul.find_all(name='li')
    # print(len(li_list))
    for li in li_list:
        h3=li.find(name='h3')
        if h3:
            title=h3.text  # text content of the h3 tag
            print(title)
        a=li.find(name='a')
        if a:
            article_url=a.get('href')  # href attribute of the a tag
            print(article_url)
    
        img=li.find(name='img')
        if img:
            img_url=img.get('src')
            print(img_url)
        p=li.find(name='p')
        if p:
            content=p.text
            print(content)

    Searching the document tree      five kinds of filters: string, regex, boolean, function, list

    from bs4 import BeautifulSoup
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    
    <p class="title" id="bbaa"><b name="xx" age="18">The Dormouse's story</b><b>xxxx</b></p>
    <p class="xxx" a="xxx">asdfasdf</p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    
    <p class="story">...</p>
    """
    
    soup=BeautifulSoup(html_doc,'lxml')   # needed by the find_all() calls below
    # ress=soup.prettify()   # pretty-print the markup
    # soup=BeautifulSoup(ress,'lxml')
    # print(ress)
    
    # walking the document tree
    # print(soup.p.name)
    # print(soup.p.attrs)
    # print(soup.p.string)
    # print(list(soup.p.strings))
    # print(soup.p.text)
    
    import re
    print(soup.find_all(name='b'))  # string filter
    
    print(soup.find_all(name=re.compile('^b')))  # regex filter
    print(soup.find_all(id=re.compile('^b')))
    
    print(soup.find_all(name=['a','b']))  # list filter
    
    print(soup.find_all(name=True))  # boolean filter
    
    def has_class_but_no_id(tag):  # function filter
        return tag.has_attr('class') and not tag.has_attr('id')
    print(soup.find_all(name=has_class_but_no_id))

    Sibling navigation and CSS selection

    sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>",'lxml')
    print(sibling_soup.b.next_sibling)
    print(sibling_soup.c.previous_sibling )
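
    For actual CSS selection BeautifulSoup provides select(); a few examples against the html_doc defined above:

    soup = BeautifulSoup(html_doc, 'lxml')
    print(soup.select('.sister'))             # by class
    print(soup.select('#link1'))              # by id
    print(soup.select('p.story > a'))         # direct children of <p class="story">
    print(soup.select('a[href^="http://example.com"]'))   # attribute prefix match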

    Automatically open Baidu (selenium)

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys # keyboard keys
    import time
    bro=webdriver.Chrome()
    bro.get('https://www.baidu.com')
    # locate the search input box
    inp=bro.find_element_by_id('kw')
    # type into the box
    inp.send_keys("图片")
    inp.send_keys(Keys.ENTER) # press Enter
    time.sleep(3)
    bro.close()

     Automated Baidu login

    from selenium import webdriver
    import time
    bro = webdriver.Chrome()
    bro.get("https://www.baidu.com")
    bro.implicitly_wait(10)
    dl_button=bro.find_element_by_link_text("登录")
    dl_button.click()
    user_login=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)
    input_name=bro.find_element_by_name('userName')
    input_name.send_keys("2960113637@qq.com")
    input_password=bro.find_element_by_id("TANGRAM__PSP_10__password")
    input_password.send_keys("zrh960906")
    submit_button=bro.find_element_by_id('TANGRAM__PSP_10__submit')
    time.sleep(1)
    submit_button.click()
    time.sleep(10)
    print(bro.get_cookies())
    bro.close()

    JD.com shopping

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    bro = webdriver.Chrome()
    bro.get("https://www.jd.com")
    bro.implicitly_wait(5)
    
    def get_goods(bro):
        print("------------------------------------")
        goods_li = bro.find_elements_by_class_name('gl-item')
        for good in goods_li:
            img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')
            if not img_url:
                img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
            url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
            price = good.find_element_by_css_selector('.p-price i').text
            name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
            commit = good.find_element_by_css_selector('.p-commit a').text
            print('''
                    product link:     %s
                    product image:    %s
                    product name:     %s
                    product price:    %s
                    comment count:    %s

                    ''' % (url, img_url, name, price, commit))
        next_page = bro.find_element_by_partial_link_text("下一页")
        time.sleep(1)
        next_page.click()
        time.sleep(1)
        get_goods(bro)
    
    input_search=bro.find_element_by_id('key')
    input_search.send_keys("大裤衩")
    input_search.send_keys(Keys.ENTER)
    
    try:
        get_goods(bro)
    except Exception as e:
        print('Finished')
    finally:
        bro.close()

    Selenium: cookies and other common operations

    # get an attribute:
    # tag.get_attribute('src')
    # get the text content
    # tag.text
    # get the tag's id, location, name and size (FYI)
    # print(tag.id)
    # print(tag.location)
    # print(tag.tag_name)
    # print(tag.size)

    # simulate the browser's back/forward buttons
    # browser.back()
    # time.sleep(10)
    # browser.forward()

    # cookie management
    # print(browser.get_cookies())  # read cookies
    # browser.add_cookie({'k1':'xxx','k2':'yyy'})  # set a cookie
    # print(browser.get_cookies())

    # run JavaScript
    # from selenium import webdriver
    # import time
    #
    # bro=webdriver.Chrome()
    # bro.get("http://www.baidu.com")
    # bro.execute_script('alert("hello world")') # pop an alert
    # time.sleep(5)
    # tab management
    # import time
    # from selenium import webdriver
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.execute_script('window.open()')
    #
    # print(browser.window_handles) # list all tab handles
    # browser.switch_to_window(browser.window_handles[1])
    # browser.get('https://www.taobao.com')
    # time.sleep(3)
    # browser.switch_to_window(browser.window_handles[0])
    # browser.get('https://www.sina.com.cn')
    # browser.close()
    
    # action chains (drag and drop)
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    #
    # from selenium.webdriver.support.wait import WebDriverWait  # wait for elements on the page to load
    # import time
    #
    # driver = webdriver.Chrome()
    # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    # wait=WebDriverWait(driver,3)
    # # driver.implicitly_wait(3)  # or use an implicit wait
    #
    # try:
    #     driver.switch_to.frame('iframeResult') # switch into the iframeResult frame
    #     sourse=driver.find_element_by_id('draggable')
    #     target=driver.find_element_by_id('droppable')
    #
    #
    # # option 1: queue the actions on one chain and run them serially
    # # actions=ActionChains(driver) # get an action-chain object
    # # actions.drag_and_drop(sourse,target) # queue the drag-and-drop
    # # actions.perform()
    #
    # # option 2: separate chains, each moving a small offset
    #
    #
    #     ActionChains(driver).click_and_hold(sourse).perform()
    #     distance=target.location['x']-sourse.location['x']
    #
    #
    #     track=0
    #     while track < distance:
    #         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
    #         track+=2
    #
    #     ActionChains(driver).release().perform()
    #
    #     time.sleep(10)
    #
    #
    # finally:
    #     driver.close()
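
    The action-chain example imports WebDriverWait but never uses it; an explicit wait normally looks like this (a sketch, using the same runoob page):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome()
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    try:
        # block for at most 10 seconds until the iframe is present, then continue
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'iframeResult')))
        driver.switch_to.frame('iframeResult')
        print(driver.find_element_by_id('draggable').text)
    finally:
        driver.close()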

    Getting and reusing cookies

    # import time
    # from selenium import webdriver
    # import json
    # browser = webdriver.Chrome()
    # browser.get('https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F')
    #
    # time.sleep(50)
    # cookie=browser.get_cookies()
    # print(cookie)
    # with open('cookie.json','w')as f:
    #     json.dump(cookie,f)
    
    
    
    import requests
    import json
    with open('cookie.json','r')as f:
        di=json.load(f)
    
    cookies = {}
    # pull the name and value out of each cookie and convert to the form requests expects
    for cookie in di:
        print(cookie)
        cookies[cookie['name']] = cookie['value']
    
    print(cookies)
    res=requests.get('https://i-beta.cnblogs.com/api/user',
                 cookies=cookies)
    print(res.text)

    Working around the login captcha (log in by hand, then reuse the cookies)

    import requests
    from selenium import webdriver
    import time
    import json
    url = 'https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F'
    driver = webdriver.Chrome()
    driver.get(url=url)
    time.sleep(50)
    driver.refresh()
    c = driver.get_cookies()
    print(c)
    with open('xxx.txt','w') as f:
        json.dump(c,f)
    
    time.sleep(3)
    with open('xxx.txt', 'r') as f:
        di = json.load(f)
    cookies = {}
    for cookie in di:
        cookies[cookie['name']] = cookie['value']
    print(cookies)
    
    headers = {
        # 'authority': 'www.jd.com',
        # 'method': 'GET',
        # 'path': '/',
        # 'scheme': 'https',
        # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # 'accept-encoding': 'gzip, deflate, br',
        # 'accept-language': 'zh-CN,zh;q=0.9',
        # 'cache-control': 'max-age=0',
        # 'upgrade-insecure-requests': '1',
        'authority': 'i-beta.cnblogs.com',
        'method': 'GET',
        'path': '/',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'if-modified-since': 'Sun, 24 Nov 2019 06:14:53 GMT',
        # 'if-modified-since': 'Sun, 24 Nov 2019 06:14:53 GMT,
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    
    }
    # make the request with the captured cookies
    response = requests.get(url='https://i-beta.cnblogs.com/api/user', headers=headers, cookies=cookies)
    print('xxx')
    response.encoding = response.apparent_encoding
    print(response.text)
    from urllib.parse import unquote_plus  # percent-encoded string -> readable text
    from urllib.parse import urlencode  # dict/text -> percent-encoded string
    msg = '''
    "client_id=c3cef7c66a1843f8b3a9e6a1e3160e20&grant_type=password&timestamp=1574838172749&source=com.zhihu.web&signature=d9ca5ecd24ebcfd42360eabd392d860e837005d8&username=%2B8618953675221&password=lqz12345&captcha=&lang=cn&utm_source=&ref_source=other_https%3A%2F%2Fwww.zhihu.com%2Fsignin%3Fnext%3D%252F"
    '''
    print(unquote_plus(msg))

    Logging in to Zhihu

    from requests_html import HTMLSession     # request + HTML parsing library     pip install requests-html
    import base64                             # base64 encode/decode
    from PIL import Image                     # image handling
    import hmac                               # keyed hashing / signing
    from hashlib import sha1                  # hashing
    import time
    from urllib.parse import urlencode        # url encoding
    import execjs                             # call node.js from python    pip install PyExecJS
    from http import cookiejar

    class Spider():
        def __init__(self):
            self.session = HTMLSession()
            self.session.cookies = cookiejar.LWPCookieJar()    # so the cookie jar supports save() and load()
            self.login_page_url = 'https://www.zhihu.com/signin?next=%2F'
            self.login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
            self.captcha_api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
            self.headers = {
                'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
            }

            self.captcha =''         # holds the captcha text
            self.signature = ''    # holds the request signature
    
        # first request: pick up the base cookies
        def get_base_cookie(self):
            self.session.get(url=self.login_page_url, headers=self.headers)

        # handle the captcha
        def deal_captcha(self):
            r = self.session.get(url=self.captcha_api, headers=self.headers)
            r = r.json()
            if r.get('show_captcha'):
                while True:
                    r = self.session.put(url=self.captcha_api, headers=self.headers)
                    img_base64 = r.json().get('img_base64')
                    with open('captcha.png', 'wb') as f:
                        f.write(base64.b64decode(img_base64))
                    captcha_img = Image.open('captcha.png')
                    captcha_img.show()
                    self.captcha = input('Enter the captcha: ')
                    r = self.session.post(url=self.captcha_api, data={'input_text': self.captcha},
                                          headers=self.headers)
                    if r.json().get('success'):
                        break
    
        def get_signature(self):
            # build the signed login signature
            a = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=sha1)
            a.update(b'password')
            a.update(b'c3cef7c66a1843f8b3a9e6a1e3160e20')
            a.update(b'com.zhihu.web')
            a.update(str(int(time.time() * 1000)).encode('utf-8'))
            self.signature = a.hexdigest()
    
        def post_login_data(self):
            data = {
                'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
                'grant_type': 'password',
                'timestamp': str(int(time.time() * 1000)),
                'source': 'com.zhihu.web',
                'signature': self.signature,
                'username': '+8618217210664',
                'password': 'zrh960906*',
                'captcha': self.captcha,
                'lang': 'en',
                'utm_source': '',
                'ref_source': 'other_https://www.zhihu.com/signin?next=%2F',
            }
    
            headers = {
                'x-zse-83': '3_2.0',
                'content-type': 'application/x-www-form-urlencoded',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
            }
    
            data = urlencode(data)
            with open('ttt.js', 'rt', encoding='utf-8') as f:
                js = execjs.compile(f.read())
            data = js.call('b', data)
            print(data)
    
            r = self.session.post(url=self.login_api, headers=headers, data=data)
            if r.status_code == 201:
                self.session.cookies.save('mycookie')
                print('Login successful')
            else:
                print('Login failed')
        def login(self):
            self.get_base_cookie()
            self.deal_captcha()
            self.get_signature()
            self.post_login_data()
    
    if __name__ == '__main__':
        zhihu_spider = Spider()
        zhihu_spider.login()

    XPath selection

    doc='''
    <html>
     <head>
      <base href='http://example.com/' />
      <title>Example website</title>
     </head>
     <body>
      <div id='images'>
       <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
       <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
       <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
       <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
       <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
       <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
      </div>
     </body>
    </html>
    '''
    from lxml import etree
    
    html=etree.HTML(doc)
    # html=etree.parse('search.html',etree.HTMLParser())
    # 1. all nodes
    a=html.xpath('//*')    # match every tag
    # 2. a specific node (result is a list)
    # a=html.xpath('//head')
    # 3. children and descendants
    a=html.xpath('//div/a')
    a=html.xpath('//body/a') # no results: a is not a direct child of body
    a=html.xpath('//body//a')
    # 4. parent node
    # a=html.xpath('//body//a[@href="image1.html"]/..')
    a=html.xpath('//body//a[1]/..')  # indices start at 1
    # or, equivalently
    a=html.xpath('//body//a[1]/parent::*')
    # 5. attribute match
    a=html.xpath('//body//a[@href="image1.html"]')

    # 6. getting text
    a=html.xpath('//body//a[@href="image1.html"]/text()')
    a=html.xpath('//body//a/text()')

    # 7. getting attributes
    # a=html.xpath('//body//a/@href')
    # # note: indices start at 1 (not 0)
    a=html.xpath('//body//a[2]/@href')
    # 8. multi-valued attributes
    # when an a tag has several classes, an exact match no longer works; use contains
    # a=html.xpath('//body//a[@class="li"]')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 9. matching several attributes
    a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
    a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
    a=html.xpath('//body//a[contains(@class,"li")]/text()')
    # 10. selecting by position
    a=html.xpath('//a[2]/text()')
    a=html.xpath('//a[2]/@href')
    # the last one
    a=html.xpath('//a[last()]/@href')
    # the ones before position 3
    a=html.xpath('//a[position()<3]/@href')
    # third from last (last()-1 would be the second-to-last)
    a=html.xpath('//a[last()-2]/@href')
    # 11. node axes
    # ancestor: ancestor nodes
    # * selects every ancestor
    a=html.xpath('//a/ancestor::*')
    # # only the div ancestors
    a=html.xpath('//a/ancestor::div')
    # attribute: attribute values
    a=html.xpath('//a[1]/attribute::*')
    # child: direct children
    a=html.xpath('//a[1]/child::*')
    # descendant: all descendants
    a=html.xpath('//a[6]/descendant::*')
    # following: every node after the current one
    a=html.xpath('//a[1]/following::*')
    a=html.xpath('//a[1]/following::*[1]/@href')
    # following-sibling: siblings after the current node
    a=html.xpath('//a[1]/following-sibling::*')
    a=html.xpath('//a[1]/following-sibling::a')
    a=html.xpath('//a[1]/following-sibling::*[2]/text()')
    a=html.xpath('//a[1]/following-sibling::*[2]/@href')
    
    print(a)