  • 1-由浅入深学爬虫

    爬虫

    爬虫入门

    urllib

    from urllib import request
    
    
    url = 'http://www.baidu.com'
    # User-Agent: 模拟浏览器,防止服务器反爬
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 使用request发送请求
    # 创建请求对象
    req = request.Request(url=url, headers=headers)
    # 发送请求
    response = request.urlopen(req)
    # 响应数据
    # print(response.read())  # 二进制
    print(response.read().decode())  # 解码,得到字符串
    # print(response.info())  # 响应信息
    # print(response.status)  # 状态码
    

    urllib模拟百度搜索

    from urllib import request
    from urllib import parse
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    
    def baidu_search(params, key):
        # 百度搜索url
        url = f'https://www.baidu.com/s?{params}'
        # 发送请求
        req = request.Request(url, headers=headers)
        res = request.urlopen(req)
        content = res.read().decode()
        print(content)
        # 保存爬取的数据
        with open(f'{key}.html', 'w', encoding='utf-8') as fp:
            fp.write(content)
            fp.flush()
    '''
    如果向服务器发送数据,那么data参数必须是一个有数据的bytes对象,否则为None。HTTP请求使用POST方法时,data必须有数据;使用GET方法时,data写成None
    data = bytes(parse.urlencode({"pro": "value"}), encoding="utf8")
    response = request.urlopen("http://www.baidu.com", data=data)
    '''
    
    
    if __name__ == '__main__':
        key = input('请输入要搜索的内容')
        params = {'wd': key}
        params = parse.urlencode(params)  # 解决url中出现中文的问题
        # print(params)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
        baidu_search(params, key)
    

    urllib爬取51job

    import re
    from urllib import request
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # url
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # 发送请求
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    # 获取数据
    content = res.read().decode('gbk')
    # 使用正则
    pattern = '"jobid_count":"(.*?)"'  # 捕获
    result = re.findall(pattern, content, re.S)  # 让.可以匹配换行
    print(result)
    

    urllib下载图片

    from urllib import request
    
    
    # 下载图片
    request.urlretrieve(
        url='https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1600067380374&di=16addb0b6e336ab847a1403cebc09a43&imgtype=0&src=http%3A%2F%2Fgss0.baidu.com%2F-vo3dSag_xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%2Fb17eca8065380cd72cbb313da744ad34588281bd.jpg',
        filename='人民币.png'
    )
    request.urlcleanup()  # 清理缓存
    

    urllib爬取豆瓣电影

    import json
    from urllib import request
    import pymysql
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0'
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    # json解析:json反序列化
    # json一定要用双引号
    # 不能在json中注释
    content = res.read().decode()
    result = json.loads(content)
    '''
    # 1.将电影数据存入本地txt文件
    movie_list = result['data']
    for movie in movie_list:
        title = movie['title']
        url = movie['url']
        with open('douban.txt', 'a', encoding='utf-8') as fp:
            s = str((title, url)) + '\n'
            fp.write(s)
            fp.flush()
    '''
    # 2.将电影数据存储到MySQL
    # 连接MySQL
    db = pymysql.connect(
        host='localhost', port=3306,
        user='root', password='nzw19940611',
        database='spider2003', charset='utf8mb4'
    )
    cur = db.cursor()  # 游标:执行SQL
    # 执行SQL
    movie_list = result['data']
    for movie in movie_list:
        title = movie['title']
        url = movie['url']
        try:
            # sql
            sql = 'insert into tb_douban_movie(movie_title, url) values("%s", "%s")' % (title, url)
            cur.execute(sql)
            db.commit()  # 事务提交
        except Exception as e:
            print('插入失败:', e)
            db.rollback()  # 回滚
    print('--插入MySQL完成--')
    # content = eval(res.read().decode())
    # for i in range(len(content['data'])):
    #     with open('豆瓣.txt', 'a', encoding='utf-8') as fp:
    #         fp.write(content['data'][i]['title'] + '\n')
    #         fp.flush()
    

    urllib使用代理IP

    import random
    from urllib import request
    import json
    
    
    # 先获取芝麻代理ip
    url = 'http://http.tiqu.alicdns.com/getip3?num=10&type=2&pro=0&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
    # 请求芝麻代理API
    response = request.urlopen(url)
    content = response.read().decode()
    # print(content)
    # json解析,提取ip和port
    result = json.loads(content)
    ip_list = result['data']
    # 把ip格式化后存入proxy_list
    proxy_list = []
    for ip in ip_list:
        ip_dict = {
            'http': f'http://{ip["ip"]}:{ip["port"]}'
        }
        proxy_list.append(ip_dict)
    # print(proxy_list)  # {'http': 'http://58.218.92.13:6905'}......
    # UserAgent池
    UserAgentList = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36"
    ]
    # 获取随机的代理IP
    proxy = random.choice(proxy_list)
    # 随机的UA
    ua = random.choice(UserAgentList)
    # 使用代理IP和UA
    proxy_handler = request.ProxyHandler(proxies=proxy)  # 构建一个代理
    opener = request.build_opener(proxy_handler)  # 使用构建的代理创建一个opener对象
    # 发送请求
    req = request.Request('http://www.baidu.com')
    req.add_header('User-Agent', ua)  # 随机的ua
    # 使用带代理的opener对象打开某个url/request
    response = opener.open(req)  # 等价于request.urlopen()
    res = response.read().decode()
    print(res)
    

    requests基础

    import requests
    
    
    # get请求
    '''
    response = requests.get('http://www.baidu.com')
    # print(response)  # <Response [200]>
    print(response.text)  # 默认使用utf-8解码,内容字符串
    print(response.content)  # 二进制
    # print(response.json())  # json解析
    
    # print(response.headers)  # 头部信息
    # print(response.cookies)  # 响应的cookie
    # print(response.status_code)  # 状态码
    '''
    
    '''
    # get请求:百度搜索
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://www.baidu.com/s?wd=hello'
    response = requests.get(url, headers=headers)
    print(response.text)
    '''
    
    # post请求:有道翻译
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    kw = input('请输入要翻译的单词:')
    # data是post的参数
    data = {
        "i": kw,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "16000738465941",
        "sign": "bf2e220fb6fe0ec8e03524a390dc0b5c",
        "lts": "1600073846594",
        "bv": "e915c77f633538e8cf44c657fe201ebb",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CLICKBUTTION"
    }
    response = requests.post(url, data=data, headers=headers)
    result = response.json()  # json解析,解析成字典
    src = result['translateResult'][0][0]['src']
    tgt = result['translateResult'][0][0]['tgt']
    print(src, tgt)
    

    bs4和xpath

    requests使用代理

    import random
    import requests
    
    
    '''
    58.218.200.228:9150
    58.218.200.223:4432
    58.218.200.226:8256
    58.218.200.228:7837
    58.218.200.223:8915
    '''
    # proxy
    proxy_list = [
        {"http": "http://58.218.200.228:9150"},
        {"http": "http://58.218.200.223:4432"},
        {"http": "http://58.218.200.226:8256"},
        {"http": "http://58.218.200.228:7837"},
        {"http": "http://58.218.200.223:8915"}
    ]
    # 获取随机代理IP
    proxy = random.choice(proxy_list)
    # 使用代理
    res = requests.get('http://www.baidu.com', proxies=proxy)
    print(res.text)
    

    requests使用session

    import requests
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0'
    # 使用session
    session = requests.session()
    # 使用session发送请求:保持会话,存储cookie
    response = session.get(url, headers=headers)
    print(response.text)
    # 当继续使用session访问其他url时,会自动携带之前的cookie
    url2 = 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=2&s=27&scrolling=y&log_id=1600483717480.6970&tpl=3_M&isList=1&show_items='
    response2 = session.get(url2, headers=headers)
    print(response2.text)
    

    requests使用cookies

    import requests
    
    
    url = 'http://www.baidu.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    cookies = {
        "PSTM": "1600136817",
        "BDRCVFR[feWj1Vr5u3D]": "mk3SLVN4HKm",
        "BAIDUID": " E922D90277D06E37B8B783C0082C650A:FG=1",
        "delPer": "0",
        "BD_CK_SAM": "1",
        "PSINO": "6",
        "H_PS_PSSID": "7506_32606_1424_7605_32116_31709_26350",
        "BIDUPSID": "89E6649E57A3DC9DABE613D88595BA0D",
        "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
        "BD_UPN": "12314753",
        "COOKIE_SESSION": "16_0_2_5_3_11_0_0_0_2_0_0_67596_0_0_0_1600136510_0_1600136818%7C5%230_0_1600136818%7C1",
        "H_PS_645EC": "3fcbYEWAxGp5VGowaCXsud%2BK436DuYp%2Bu6fs%2FUwAz9UFcCyuSSHqbS7CSMLQBpsMjeN%2F"
    }
    response = requests.get(url, headers=headers, cookies=cookies)
    # print(response.text)
    # print(response.cookies)
    # 将服务器返回的cookiejar,转换成字典dict
    cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
    print(cookie_dict)
    

    bs4基本用法

    from bs4 import BeautifulSoup
    # 安装BeautifulSoup4
    # pip install beautifulsoup4
    # 安装HTML解析器lxml
    
    
    html_doc = """
    <html>
        <head>
            <title>呵呵</title>
        </head>
        <body>
        <p class="title">
            <b>哈哈</b>
        </p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="first" class="sister" id="link1">first</a>,
        <a href="second" class="sister" id="link2">second</a> and
        <a href="third" class="sister" id="link3">third</a>;
        </p>
        <p class="story">end</p>
        </body>
    </html>
    """
    # 使用bs4
    # 创建bs4对象
    soup = BeautifulSoup(html_doc, 'lxml')
    # print(soup)
    # print(type(soup))
    # tag标签
    # print(soup.head)
    # print(type(soup.head))  # <class 'bs4.element.Tag'>
    # print(soup.title)  # title标签
    # print(soup.b)  # 哈哈
    # print(soup.body.p.b)
    # attribute属性
    # print(soup.p.attrs)  # {'class': ['title']}第一个p所有属性
    # print(soup.a.attrs)  # {'href': 'first', 'class': ['sister'], 'id': 'link1'}第一个a的所有属性
    # print(soup.a.attrs['href'])  # 获取某个属性值
    # 文本内容,建议使用text
    # print(soup.b.string)  # 哈哈
    # print(soup.b.text)  # 哈哈
    # print(soup.p.string)  # None
    # print(soup.p.text)  # 哈哈
    # find_all():找到所有匹配的节点
    # print(soup.find_all('p'))  # 所有p节点
    # print(soup.find_all('p')[2])
    # 根据属性来查找
    # print(soup.find_all('p', attrs={'class': 'story'}))
    # print(soup.find_all('a', attrs={'id': 'link1'}))
    # print(soup.find_all('a', id='link1'))
    # print(soup.find_all('a', limit=2))  # 前两个a标签
    # print(soup.find_all(['a', 'b']))  # 找所有a标签和b标签
    # css选择器
    # soup.select()
    # print(soup.select('p'))  # 标签选择器
    # print(soup.select('#link2'))  # id选择器
    # print(soup.select('.sister'))  # class选择器
    # print(soup.select('p #link3'))  # 后代选择器
    # 从文档中获取所有文字内容
    print(soup.get_text())
    

    bs4解析股票基金数据

    import requests
    from bs4 import BeautifulSoup
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 股票网址
    url = 'http://quote.stockstar.com/fund/stock.shtml'
    response = requests.get(url, headers=headers)
    content = response.content.decode('gb2312')
    # print(content)
    # bs4解析网页内容
    soup = BeautifulSoup(content, 'lxml')
    tr_list = soup.select('#datalist tr')
    # print(tr_list)
    for tr in tr_list:
        s_code = tr.find_all('td')[0].a.text  # 基金代码
        s_name = tr.find_all('td')[1].a.text  # 基金名称
        s_unit = tr.find_all('td')[2].text  # 单位
        s = str((s_code, s_name, s_unit)) + '\n'
        with open('fund.txt', 'a', encoding='utf-8') as fp:
            fp.write(s)
            fp.flush()
    

    xpath的基本使用

    # xpath需要安装lxml
    # pip install lxml
    from lxml import etree
    
    
    html_doc = """
    <html>
        <head>
            <title>呵呵</title>
        </head>
        <body>
            <ul>
                <li class="item" id="box1">
                    <a href="aabb">打仗1</a>
                </li>
                <li class="item" id="box2">
                    <a href="aabbcc">打仗2</a>
                </li>
                <li class="item" id="box3">
                    <a href="bbccdd">打仗3</a>
                </li>
                <li class="item" id="box4">
                    <a href="ddee">打仗4</a>
                </li>
            </ul>
            <p class="item">
                <a href="aabb">打仗5</a>
            </p>
        </body>
    </html>
    """
    # 使用xpath
    # 创建etree对象
    mytree = etree.HTML(html_doc)
    # print(mytree)  # <Element html at 0x1feda822e08>
    # print(type(mytree))  # <class 'lxml.etree._Element'>
    # /:子节点
    # //:后代节点
    # print(mytree.xpath('/html'))  # html标签
    # print(mytree.xpath('/html/head'))  # head标签
    # print(mytree.xpath('/html/body/ul/li'))  # 所有li标签
    # print(mytree.xpath('//li'))  # 所有li标签
    # print(mytree.xpath('//li')[1])  # 第二个li标签,得到etree对象
    # print(mytree.xpath('//li[2]/@id'))
    # text():文本内容
    # li_list = mytree.xpath('//li')
    # for li in li_list:
    #     # 里面的.表示当前节点,不能省略
    #     content = li.xpath('./a/text()')  # 文本内容
    #     attr = li.xpath('./@id')  # 属性值
    #     print(content, attr)
    # 谓语:加条件
    # 谓词写在[]中
    # print(mytree.xpath('//li[1]/a/text()'))  # ['打仗1']
    # print(mytree.xpath('//li[last()]/a/text()'))  # ['打仗4']
    # print(mytree.xpath('//li[last()-1]/a/text()'))  # ['打仗3'],倒数第二个
    # print(mytree.xpath('//li[position()<3]/a/text()'))  # ['打仗1', '打仗2']
    # print(mytree.xpath('//li[position()>=3]/a/text()'))  # ['打仗3', '打仗4']
    # print(mytree.xpath('//li[@id="box1"]/a/text()'))  # ['打仗1']
    # print(mytree.xpath('//li[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4']
    # *通配符
    # print(mytree.xpath('//*[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
    # |或
    # print(mytree.xpath('//li[@class="item"]/a/text() | //p[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
    # 包含contains()
    # print(mytree.xpath('//li/a[contains(@href, "aa")]/text()'))  # ['打仗1', '打仗2']
    print(mytree.xpath('//li/a[contains(text(), "2")]/text()'))  # ['打仗2']
    

    xpath解析股票基金数据

    import requests
    from lxml import etree
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 股票网址
    url = 'http://quote.stockstar.com/fund/stock.shtml'
    response = requests.get(url, headers=headers)
    content = response.content.decode('gb2312')
    # print(content)
    # xpath解析网页内容
    mytree = etree.HTML(content)
    tr_list = mytree.xpath('//tbody[@id="datalist"]/tr')
    for i, tr in enumerate(tr_list):
        f_code = tr.xpath('./td[1]/a/text()')[0]
        f_name = tr.xpath('./td[2]/a/text()')[0]
        f_unit = tr.xpath('./td[3]/text()')[0]
        # csv文件
        with open('fund.csv', 'a', encoding='gb2312') as fp:
            if i == 0:
                fp.write('基金代码,基金名称,单位净值\n')
            f = f'{f_code},{f_name},{f_unit}\n'
            fp.write(f)
            fp.flush()
    

    selenium和验证码破解

    超级鹰破解验证码

    import requests
    from hashlib import md5
    
    
    class Chaojiying_Client(object):
        def __init__(self, username, password, soft_id):
            self.username = username
            password = password.encode('utf8')
            self.password = md5(password).hexdigest()
            self.soft_id = soft_id
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }
        def PostPic(self, im, codetype):
            """
            im: 图片字节
            codetype: 题目类型 参考 http://www.chaojiying.com/price.html
            """
            params = {
                'codetype': codetype,
            }
            params.update(self.base_params)
            files = {'userfile': ('ccc.jpg', im)}
            r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
            return r.json()
        def ReportError(self, im_id):
            """
            im_id:报错题目的图片ID
            """
            params = {
                'id': im_id,
            }
            params.update(self.base_params)
            r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
            return r.json()
    
    
    if __name__ == '__main__':
        chaojiying = Chaojiying_Client('lotuslaw', '******', '908114')
        # 用户中心>>软件ID 生成一个替换 96001
        img = open('../a.jpg', 'rb').read()
        #本地图片文件路径 来替换 a.jpg
        print(chaojiying.PostPic(img, 1902))
        # 1902 验证码类型
    

    selenium的基本用法

    import time
    from selenium import webdriver
    
    # 创建浏览器驱动
    # 可以手动配置驱动的路径
    # 将chromedriver.exe放到python.exe同目录
    from selenium.webdriver.common.keys import Keys
    driver = webdriver.Chrome()
    # 打开浏览器
    driver.get('http://www.baidu.com')
    # 获取网页源码
    # print(driver.page_source)
    # 关闭
    # time.sleep(5)
    # driver.close()  # 只关闭一个窗口
    # driver.quit()  # 退出,关闭所有窗口
    # 百度贴吧
    driver.get('https://tieba.baidu.com/index.html')
    '''
    # 查找元素
    wd1 = driver.find_element_by_id('wd1')
    # wd1.send_keys('美女', Keys.ENTER)  # 给输入框填充内容,自动按回车
    time.sleep(2)
    wd1.send_keys('美女')
    # 点击按钮
    btn = driver.find_element_by_xpath('//a[@class="search_btn search_btn_enter_ba j_enter_ba"]')
    # btn.click()
    # 获取内容和属性值
    print(btn.get_attribute('innerText'))  # innerText, innerHTML
    print(wd1.get_attribute('value'))  # 输入框的值
    '''
    # 执行js
    time.sleep(3)
    # 执行JS脚本
    # driver.execute_script('window.scrollBy(0, 5000)')
    for i in range(5):
        driver.execute_script('window.scrollBy(0,5000)')
        time.sleep(2)
    # 截图
    # driver.save_screenshot('teiba.png')
    

    selenium登录知乎

    import time
    from selenium import webdriver
    
    
    # 知乎登录页面
    url = 'https://www.zhihu.com/signin?next=%2F'
    # 打开知乎页面
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(2)
    # 点击qq
    driver.find_element_by_xpath('//*[@class="Button Login-socialButton Button--plain"][2]').click()
    # 停10秒,手动扫码登录
    time.sleep(10)
    # 刷新页面
    driver.refresh()
    # 获取页面
    print(driver.page_source)
    print(driver.get_cookies())
    

    headless无头浏览器

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    
    # 在正常页面跑通流程后,使用无头浏览器节约资源
    options = ChromeOptions()
    options.add_argument('--headless')  # 无头浏览器
    options.add_argument('--disable-gpu')  # 禁用GPU
    # 创建驱动对象
    driver = webdriver.Chrome(options=options)
    driver.get('http://www.baidu.com')
    print(driver.page_source)
    

    selenium设置代理

    from selenium import webdriver
    
    
    options = webdriver.ChromeOptions()
    # 设置代理IP
    options.add_argument('--proxy-server=http://58.218.200.226:8256')
    # 创建驱动
    driver = webdriver.Chrome(options=options)
    driver.get('http://www.baidu.com')
    print(driver.page_source)
    

    超级鹰人人网验证码破解登录

    from Day03.chaojiying import chaojiying
    import requests
    import random
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    
    def get_code():
        url = 'http://icode.renren.com/getcode.do?t=web_login&rnd=' + str(random.random())
    
        # 获取验证码图片
        res = session.get(url, headers=headers)
        content = res.content  # 图片二进制
    
        # 使用超级鹰破解
        cjy = chaojiying.Chaojiying_Client('lotuslaw', '******', '908114')
        code = cjy.PostPic(content, 1902)
        # print(code)
        return code
    
    
    def login(code):
        # 登录接口抓取:给一个错误的密码进行登录
        login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020831616448'
    
        login_data = {
            "email": "18566218480",
            "icode": code,
            "origURL": "http://www.renren.com/home",
            "domain": "renren.com",
            "key_id": "1",
            "captcha_type": "web_login",
            "password": "88d7f48bf698c0f1b0dcca94bfb40361c6c82ced70f8cbf0619d725e0341d2e5",
            "rkey": "e8d80414c49ceb424291126858ee6226",
            "f": ''
        }
        # 发送请求
        res = session.post(login_url, data=login_data, headers=headers)
        content = res.text
        print(content)
        
        
    # 登录后访问个人中心
    def user_center():
        url = 'http://www.renren.com/480089210/profile'
        res = session.get(url, headers=headers)
        print(res.text)
    
    
    if __name__ == '__main__':
        session = requests.session()
        code = get_code()
        login(code)
        user_center()
    

    Scrapy基础

    • Scrapy框架介绍

      • Scrapy是用纯Python实现的一个为了爬取网站数据、提取结构性数据而编写的应用框架,用途非常广泛
      • 用户只需要定制开发几个模块就可以轻松的实现一个爬虫,用来抓取网页内容以及各种图片,非常之方便
      • Scrapy 使用了Twisted(其主要对手是Tornado)异步网络框架来处理网络通讯,可以加快我们的下载速度,不用自己去实现异步框架,并且包含了各种中间件接口,可以灵活地完成各种需求
    • Scrapy框架组件

      • Scrapy Engine(引擎)
        • 负责Spider、ItemPipeline、Downloader、Scheduler中间的通讯,信号、数据传递等
      • Scheduler(调度器)
        • 它负责接受引擎发送过来的Request请求,并按照一定的方式进行整理排列,入队,当引擎需要时,交还给引擎
      • Downloader(下载器)
        • 负责下载Scrapy Engine(引擎)发送的所有Requests请求,并将其获取到的Responses交还给Scrapy Engine(引擎),由引擎交给Spider来处理
      • Spider(爬虫)
        • 它负责处理所有Responses,从中分析提取数据,获取Item字段需要的数据,并将需要跟进的URL提交给引擎,再次进入Scheduler(调度器)
      • Item Pipeline(管道)
        • 它负责处理Spider中获取到的Item,并进行后期处理(详细分析、过滤、存储等)的地方
      • Downloader Middlewares(下载中间件)
        • 你可以当作是一个可以自定义扩展下载功能的组件
      • Spider Middlewares(Spider中间件)
        • 你可以理解为是一个可以自定扩展和操作引擎Spider中间通信的功能组件(比如进入Spider的Responses和从Spider出去的Requests)
    • 安装使用

      • 安装
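        • 常规方式为pip安装(补充示意)

          • pip install scrapy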

      • 使用

        • 新建项目

          • 进入项目存放的目录
          • scrapy startproject meiju
        • 创建爬虫程序

          • 进入项目目录
          • scrapy genspider meijuSpider meijutt.tv
            • meijuSpider为爬虫文件名
            • meijutt.tv为爬取网址的域名
          • 工程文件介绍
            • scrapy.cfg
              • 项目的配置信息,主要为Scrapy命令行工具提供一个基础的配置信息。(真正爬虫相关的配置信息在settings.py文件中)
            • items.py
              • 设置数据存储模板,用于结构化数据,如:Django的Model
            • pipelines.py
              • 数据处理行为,如:一般结构化的数据持久化
            • settings.py
              • 配置文件,如:递归的层数、并发数,延迟下载等
            • spiders
              • 爬虫目录,如:创建文件,编写爬虫规则
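          • 执行genspider后生成的爬虫模板大致如下(以实际生成内容为准,仅作示意)

          • import scrapy


            class MeijuspiderSpider(scrapy.Spider):
                name = 'meijuSpider'
                allowed_domains = ['meijutt.tv']
                start_urls = ['http://meijutt.tv/']

                def parse(self, response):
                    pass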
        • 定义Item

          • class MeijuItem(scrapy.Item):
                name = scrapy.Field()
            
        • 编写爬虫

          • 修改起始爬取的url

          • 数据处理

          • def parse(self, response):
                item = MeijuItem()
                item['name'] = name
                yield item
            
        • 启用一个Item Pipeline组件,在settings.py开启

          • ITEM_PIPELINES = {
               'meiju.pipelines.MeijuPipeline': 300,
            }
            
        • 编写 Pipeline 来存储提取到的Item

          • 快速存储

            • 在运行文件中存储

            • scrapy crawl meijuSpider -o meiju.json
              scrapy crawl meijuSpider -o meiju.csv
              scrapy crawl meijuSpider -o meiju.xml
              
        • 运行爬虫

          • 命令行运行

            • scrapy crawl meijuSpider
            • scrapy crawl meijuSpider --nolog
          • 新建一个运行文件start

          • # 执行scrapy命令:开启爬虫
            scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
            # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
            scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
            # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
            

    Scrapy爬取美剧网

    • mymeiju.py
    import scrapy
    from ..items import MeijuItem
    
    
    class MymeijuSpider(scrapy.Spider):
        # 爬虫名:唯一
        name = 'mymeiju'
        # 允许的域名列表
        allowed_domains = ['meijutt.tv']
        # 开始的url列表:启动项目后会直接自动爬取的url列表
        start_urls = ['https://www.meijutt.tv/new100.html']
        # 解析数据方法:
        # 1.当start_urls中的网页请求完成后会自动调用当前的parse方法,并传入响应
        def parse(self, response, **kwargs):
            print('*' * 100)
            # print(response)
            # print(type(response))  # <class 'scrapy.http.response.html.HtmlResponse'>
            # print(response.text)  # 获取文本内容
            # print(response.body)  # 二进制内容
            # print(response.json())  # 解析json
            print('*' * 100)
            # 解析数据:xpath
            li_list = response.xpath('//ul[@class="top-list  fn-clear"]/li')
            for li in li_list:
                # 有3种方式获取内容
                # name = li.xpath('./h5/a/text()').get()
                # name = li.xpath('./h5/a/text()')[0].extract()
                # name = li.xpath('./h5/a/text()').extract_first()
                # name = li.xpath('./span/text()').getall()  # 获取所有匹配的内容,他是一个列表
                name = li.xpath('./h5/a/text()').get()  # 剧名
                state = li.xpath('./span[1]/font/text()').get()  # 状态:级数
                mjzm = li.xpath('./span[2]/em/text()').get()  # 字幕
                mjjq = li.xpath('./span[3]/text()').get()  # 分类
                mjtv = li.xpath('./span[4]/text()').get()  # 电视台
                mjtime = li.xpath('./div[last()]/font/text()').get()  # 更新时间
                if not mjtime:
                    mjtime = li.xpath('./div[last()]/text()').get()
                # print(name)
                # item:封装每个数据
                # item = MeijuItem()
                # item['name'] = name  # 不能用点语法
                item = MeijuItem(
                    name=name, state=state, mjzm=mjzm,
                    mjjq=mjjq, mjtv=mjtv, mjtime=mjtime
                )
                # 生成器,既是迭代器,又是可迭代对象
                yield item
                # 这里的item会传入到pipelines中,需要做两个事情
                # 1.需要在parse方法中yield item
                # 2.需要在settings中将ITEM_PIPELINES设置好
                # yield返回2种值
                # 1.返回item
                # 2.返回Request/FormRequest
    
    • items.py
    import scrapy
    
    
    # Item: 类似Django中的Model
    class MeijuItem(scrapy.Item):
        name = scrapy.Field()
        state = scrapy.Field()
        mjzm = scrapy.Field()
        mjjq = scrapy.Field()
        mjtv = scrapy.Field()
        mjtime = scrapy.Field()
    
    • pipelines.py
    from itemadapter import ItemAdapter
    
    
    # pipeline:专门用来存储数据
    class MeijuPipeline:
        # 开始爬虫:自动调用该函数一次
        def open_spider(self, spider):
            pass
            # 打开文件
            # self.fp = open('meiju.txt', 'a', encoding='utf-8')
            # print('开始爬取......')
    
        # 关闭爬虫:自动调用
        def close_spider(self, spider):
            pass
            # 关闭文件
            # self.fp.close()
            # print('爬虫结束!')
    
        # process_item:会被调用很多次(取决于yield item的次数)
        def process_item(self, item, spider):
            # print(spider.name)  # 爬虫名
            # print(f'item:{item}', type(item))
            # 写入文件
            # with open('meiju.txt', 'a', encoding='utf-8') as fp:
            #     fp.write(str(item) + '\n')
            # self.fp.write(str(item) + '\n')
            # print(f'{item["name"]}写入成功')
            return item
    
    • settings.py
    BOT_NAME = 'meiju'
    SPIDER_MODULES = ['meiju.spiders']
    NEWSPIDER_MODULE = 'meiju.spiders'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
       'meiju.pipelines.MeijuPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    # 执行scrapy命令:开启爬虫
    # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
    # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
    
    # 使用split
    # scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
    # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
    
    # 快速存储成指定格式的文件
    # 支持的文件格式('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
    # scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.json'.split())
    scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.csv'.split())
    

    Scrapy爬取当当网

    • dangdang_spider.py
    import scrapy
    from ..items import DangdangItem
    
    
    class DangdangSpiderSpider(scrapy.Spider):
        name = 'dangdang_spider'
        allowed_domains = ['dangdang.com']
        start_urls = ['http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html']
        def parse(self, response, **kwargs):
            li_list = response.xpath('//ul[@id="component_59"]/li')
            for li in li_list:
                book_name = li.xpath('./a/@title').get()
                book_price = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()').get()
                book_author = li.xpath('./p[@class="search_book_author"]/span[1]/a/text()').get()
                book_publishers = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').get()
                book_star = li.xpath('./p[@class="search_star_line"]/span/span/@style').get()[6:-1]
                book_comment = li.xpath('./p[4]/a/text()').get()
                book_picture = li.xpath('./a/img/@data-original')
                if book_picture:
                    book_picture = book_picture.get()
                else:
                    book_picture = li.xpath('./a/img/@src').get()
                print(book_picture)
                item = DangdangItem(
                    book_name=book_name,
                    book_price=book_price,
                    book_author=book_author,
                    book_publishers=book_publishers,
                    book_star=book_star,
                    book_comment=book_comment,
                    book_picture=book_picture
                )
                yield item
    
    • items.py
    import scrapy
    
    
    class DangdangItem(scrapy.Item):
        book_name = scrapy.Field()
        book_price = scrapy.Field()
        book_author = scrapy.Field()
        book_publishers = scrapy.Field()
        book_star = scrapy.Field()
        book_comment = scrapy.Field()
        book_picture = scrapy.Field()
    
    • pipelines.py
    import pymysql
    
    
    class DangdangPipeline:
        def open_spider(self, spider):
            print('开始爬取')
            self.db = pymysql.connect(
                host='localhost',
                port=3306,
                user='root',
                password='******',
                database='spider2003',
                charset='utf8'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('爬取结束')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            # item['name'].replace('"', "'")  # 单引号替换双引号
            sql = 'insert into dangdang(book_name, book_price, book_author, book_publishers, book_star, book_comment, book_picture) values ("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (item['book_name'], item['book_price'], item['book_author'], item['book_publishers'], item['book_star'], item['book_comment'], item['book_picture'])
            try:
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'dangdang'
    SPIDER_MODULES = ['dangdang.spiders']
    NEWSPIDER_MODULE = 'dangdang.spiders'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
       'dangdang.pipelines.DangdangPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl dangdang_spider'.split())
    

    Scrapy进阶

    • Scrapy Shell
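      • 交互式调试工具,可在终端中直接测试选择器(以下为示意用法,目标网址仅作演示)

      • # 终端中启动:scrapy shell "https://www.baidu.com"
        # 进入shell后可直接使用response对象调试:
        # >>> response.status
        # >>> response.xpath('//title/text()').get()
        # >>> response.css('title::text').get()
        # >>> view(response)  # 在系统浏览器中打开该响应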

    • Selectors选择器

      • Scrapy Selectors 内置 XPath 和 CSS Selector 表达式机制
      • Selector四个基本的方法(xpath最常用)
        • xpath()
          • 传入xpath表达式,返回该表达式所对应的所有节点的selector list列表
        • extract()
          • 序列化该节点为Unicode字符串并返回list, extract_first()
        • css()
          • 传入CSS表达式,返回该表达式所对应的所有节点的selector list列表,语法同 BeautifulSoup4中soup.select()
        • re()
          • 根据传入的正则表达式对数据进行提取,返回Unicode字符串list列表
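      • 四个方法的最小离线示例(HTML内容为虚构,仅演示用法)

      • from scrapy.selector import Selector

        html = '<ul><li class="item"><a href="a.html">第1条</a><a href="b.html">第2条</a></li></ul>'
        sel = Selector(text=html)
        print(sel.xpath('//li/a/text()').getall())         # xpath(): ['第1条', '第2条']
        print(sel.xpath('//li/a/text()').extract_first())  # extract_first(): '第1条'
        print(sel.css('li.item a::attr(href)').extract())  # css(): ['a.html', 'b.html']
        print(sel.xpath('//li/a/text()').re(r'第(\d)条'))    # re(): ['1', '2']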
    • Spider类

      • 概述
        • Spider类定义了如何爬取某个(或某些)网站。包括了爬取的动作(例如:是否跟进链接)以及如何从网页的内容中提取结构化数据(爬取item)。 换句话说,Spider就是你定义爬取的动作及分析某个网页(或者是有些网页)的地方
        • scrapy.Spider是最基本的类,所有编写的爬虫必须继承这个类
      • 主要用到的函数及调用顺序
        • __init__()
          • 初始化爬虫名字和start_urls列表
        • start_requests()
          • 调用make_requests_from_url():生成Requests对象交给Scrapy下载并返回response
        • parse(self, response)
          • 解析response,并返回Item或Requests(需指定回调函数)
          • Item传给Item pipline持久化,而Requests交由Scrapy下载,并由指定的回调函数处理(默认parse()),一直进行循环,直到处理完所有的数据为止
      • 主要属性和方法
        • name
          • 定义spider名字的字符串。唯一
        • allowed_domains
          • 包含了spider允许爬取的域名(domain)的列表,可选
        • start_urls
          • 初始URL元组/列表。当没有指定特定的URL时,spider将从该列表中开始进行爬取
        • start_requests(self)
          • 该方法必须返回一个可迭代对象(iterable)。该对象包含了spider用于爬取(默认实现是使用 start_urls 的url)的第一个Request
          • 当spider启动爬取并且未指定start_urls时,该方法被调用
        • parse(self, response)
          • 当请求url返回网页没有指定回调函数时,默认的Request对象回调函数。用来处理网页返回的response,以及生成Item或者Request对象
        • log(self, message[, level, component])
          • 使用 scrapy.log.msg() 方法记录日志信息
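      • 结合上述属性与方法的最小Spider骨架(域名与选择器为虚构示意)

      • import scrapy


        class DemoSpider(scrapy.Spider):
            name = 'demo'                      # 爬虫名,唯一
            allowed_domains = ['example.com']  # 允许爬取的域名
            start_urls = ['http://example.com/list.html']

            # 也可重写start_requests,自定义第一批请求
            # def start_requests(self):
            #     yield scrapy.Request('http://example.com/list.html', callback=self.parse)

            def parse(self, response, **kwargs):
                # 默认回调:解析response,yield出Item或新的Request
                for title in response.xpath('//h2/a/text()').getall():
                    yield {'title': title}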
    • CrawlSpider类

      • 概述
        • CrawlSpider是Spider的派生类
        • Spider类的设计原则是只爬取start_urls列表中的网页
        • CrawlSpider类定义了一些规则(rule)来提供跟进link的方便的机制,从爬取的网页中获取link并继续爬取的工作更适合
          • 自动翻页
      • LinkExtractors
        • 概述
          • 使用LinkExtractors 的目的: 提取链接
          • 每个LinkExtractor有唯一的公共方法是 extract_links(),它接收一个 Response 对象,并返回一个 scrapy.link.Link 对象
        • 主要参数
          • allow
            • 满足括号中“正则表达式”的值会被提取,如果为空,则全部匹配
          • deny
            • 与这个正则表达式(或正则表达式列表)匹配的URL一定不提取
          • allow_domains
            • 会被提取的链接的domains/域名
          • deny_domains
            • 一定不会被提取链接的domains
          • restrict_xpaths
            • 使用xpath表达式,和allow共同作用过滤链接/范围
      • rules
        • 概述
          • 在rules中包含一个或多个Rule对象,每个Rule对爬取网站的动作定义了特定操作
          • 如果多个Rule匹配了相同的链接,则根据规则在本集合中被定义的顺序,第一个会被使用
        • 主要参数
          • link_extractor
            • 是一个Link Extractor对象,用于定义需要提取的链接
          • callback
            • 从link_extractor中每获取到链接时,参数所指定的值作为回调函数,该回调函数接受一个response作为其第一个参数(尽量避免使用parse)
          • follow
            • 是一个布尔(boolean)值,指定了根据该规则从response提取的链接是否需要跟进
            • follow=True
              • 跟随:会自动匹配子网页中的其他符合规则的链接并爬取
          • process_links
            • 指定该spider中哪个的函数将会被调用,从link_extractor中获取到链接列表时将会调用该函数
            • 该方法主要用来过滤
          • process_request
            • 指定该spider中哪个的函数将会被调用, 该规则提取到每个request时都会调用该函数(用来过滤request)
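      • Rule与LinkExtractor的最小示意(正则与域名为虚构,完整实例见后文“Scrapy爬取糗事百科”)

      • from scrapy.spiders import CrawlSpider, Rule
        from scrapy.linkextractors import LinkExtractor


        class DemoCrawlSpider(CrawlSpider):
            name = 'demo_crawl'
            allowed_domains = ['example.com']
            start_urls = ['http://example.com/page/1/']

            rules = [
                Rule(
                    LinkExtractor(allow=(r'/page/\d+/',)),  # 提取翻页链接
                    callback='parse_item',                  # 回调函数,避免使用parse
                    follow=True                             # 跟进子页面中符合规则的链接
                )
            ]

            def parse_item(self, response, **kwargs):
                yield {'url': response.url}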
    • Robots协议

      • 概述
        • Robots协议(也称为爬虫协议、机器人协议等)的全称是“网络爬虫排除标准”(Robots Exclusion Protocol),网站通过Robots协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取
        • robots.txt文件是一个文本文件。当一个搜索蜘蛛访问一个站点时,它会首先检查该站点根目录下是否存在robots.txt,如果存在,搜索机器人就会按照该文件中的内容来确定访问的范围;如果该文件不存在,所有的搜索蜘蛛将能够访问网站上所有没有被口令保护的页面
      • 使用
        • 禁止robots协议将 ROBOTSTXT_OBEY = True改为False
    • 深度爬取

      • 爬取到链接,进入链接继续爬取,爬取到链接,再次进入链接爬取......

      • yield scrapy.Request(
                        url=href,  # url链接
                        callback=self.parse_detail,  # 回调函数:请求成功后的响应
                        meta={'name': name}  # 传入到parse_detail中的数据
                    )
        
        • scrapy.Request异步爬取
      • name = response.meta['name']

        • 取出小说名
        • 逐级传递
      • yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)

        • 将数据传入管道
    • 循环遍历实现翻页

      • # 爬取下一页
        if self.page <= 100:
            print(f'---开始爬取{self.page}页---')
            self.page = self.page + 1
            url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
            yield scrapy.Request(url, callback=self.parse)
        

    Scrapy爬取笔趣阁

    • biquege_spider.py
    import requests
    import scrapy
    from ..items import BiqugeItem
    
    
    class BiqugeSpiderSpider(scrapy.Spider):
        name = 'biquge_spider'
        allowed_domains = ['biquge5200.cc']
        start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']
        # 爬取笔趣阁的首页
        def parse(self, response, **kwargs):
            # 解析数据
            li_list = response.xpath('//div[@class="l"]/ul/li')
            for li in li_list:
                name = li.xpath('./span[@class="s2"]/a/text()').get()  # 小说名
                href = li.xpath('./span[@class="s2"]/a/@href').get()  # 小说链接
                # requests:同步
                # print(len(requests.get(href).text))
                # print('-' * 100)
                # 异步:scrapy.Request
                # 请求小说详情页
                yield scrapy.Request(
                    url=href,  # url链接
                    callback=self.parse_detail,  # 回调函数:请求成功后的响应
                    meta={'name': name}  # 传入到parse_detail中的数据
                )
        # 详情页
        def parse_detail(self, response):
            # 取出小说名
            name = response.meta['name']
            # 解析数据
            dd_list = response.xpath('//div[@id="list"]/dl/dd')
            for dd in dd_list:
                zj_name = dd.xpath('./a/text()').get()  # 章节名称
                zj_href = dd.xpath('./a/@href').get()  # 章节内容链接
                # 请求每个章节的小说内容
                yield scrapy.Request(
                    url=zj_href,
                    callback=self.parse_content,
                    meta={'name': name, 'zj_name': zj_name}
                )
        # 小说内容页
        def parse_content(self, response):
            # 取出小说名及章节名
            name = response.meta['name']
            zj_name = response.meta['zj_name']
            # 解析数据
            p_list = response.xpath('//*[@id="content"]/p/text()').getall()
            zj_content = '\n'.join(p_list)
            # item
            # 将数据传入管道
            yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)
    
    • items.py
    import scrapy
    
    
    class BiqugeItem(scrapy.Item):
        name = scrapy.Field()
        zj_name = scrapy.Field()
        zj_content = scrapy.Field()
    
    • pipelines.py
    import os
    from itemadapter import ItemAdapter
    
    
    class BiqugePipeline:
        # def __init__(self):
        #     self.path = r'C:\Users\86188\Desktop\Spider\Day05\scrapy_project\biquge\books'
        def process_item(self, item, spider):
            # 确保小说目录存在,再写入章节内容
            if not os.path.isdir("books/%s" % item['name']):
                os.mkdir("books/%s" % item['name'])
            with open('books/%s/%s.txt' % (item["name"], item["zj_name"]), 'a', encoding='utf-8') as fp:
                fp.write(item["zj_content"])
                fp.flush()
            print(f'item:{item["name"]}-{item["zj_name"]}')
            return item
    
    • settings.py
    BOT_NAME = 'biquge'
    SPIDER_MODULES = ['biquge.spiders']
    NEWSPIDER_MODULE = 'biquge.spiders'
    USER_AGENT = 'biquge (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'biquge.pipelines.BiqugePipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl biquge_spider'.split())
    

    Scrapy爬取京东

    • jd_spider.py
    import scrapy
    from selenium import webdriver
    from ..items import JdItem
    
    
    '''
    常见的反爬虫策略之一。
    这个参数的值,表明你是从哪个网页跳转过来的。
    比如说我请求获得淘宝评论的时候,他的referer是商品详情页面,表明我从这件商品详情页请求的相关评论,没有referer就不会给你这个评论
    from fake_useragent import UserAgent
    #伪装成浏览器
    ua = UserAgent()
    headers = {'User-Agent':ua.random} #一般网站伪装成这样也就够了,但是如果想爬图片,图片反盗链的话。如下
    #其实很好理解,就是告诉你要下载的那个图片页面,我是从主页面来的,现在把数据给我。
    headers = {'User-Agent':ua.random,'Referer':'这里放入图片的主页面'}
    #然后在后续requests中传入header即可
    '''
    class JdSpiderSpider(scrapy.Spider):
        name = 'jd_spider'
        allowed_domains = ['jd.com']
        start_urls = [
            # 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0',
            # 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=4&s=79&scrolling=y&log_id=1600660067305.2410&tpl=3_M&isList=1&show_items=',
            'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=6&s=131&scrolling=y&log_id=1600661434422.8716&tpl=3_M&isList=1&show_items='
        ]
        page1 = 1
        # page2 = 2
        s1 = 1
        # s2 = 27
        def parse(self, response, **kwargs):
            # driver = webdriver.Chrome()
            # driver.execute_script('window.scrollBy(0,10000)')
            li_list = response.xpath('//li[@class="gl-item"]')
            print(len(li_list))
            for li in li_list:
                shoes_name = li.xpath('./div/div[@class="p-img"]/a/@title').get()
                shoes_price = li.xpath('./div/div[@class="p-price"]/strong/i/text()').get()
                shoes_picture = li.xpath('./div/div[@class="p-img"]/a/img/@data-lazy-img').get()
                print(shoes_name, shoes_price, shoes_picture)
                yield JdItem(shoes_name=shoes_name, shoes_price=shoes_price, shoes_picture=shoes_picture)
            # driver.close()
            # if self.page1 <= 10:
            # # if self.page2 <= 200:
            #     print(f'---开始爬取{self.page1}页---')
            #     # print(f'---开始爬取{self.page2}页---')
            #     self.page1 = self.page1 + 2
            #     self.s1 = self.s1 + 52
            #     # self.page2 = self.page2 + 2
            #     # self.s2 = self.s2 + 52
            #     url = f'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page={self.page1}&s={self.s1}&click=0'
            #     # url =  f'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page={self.page2}&s={self.s2}&scrolling=y&log_id=1600431181482.2679&tpl=3_M&isList=1&show_items='
            #
            #
            #     yield scrapy.Request(url, callback=self.parse)
    
    • items.py
    import scrapy
    
    
    class JdItem(scrapy.Item):
        shoes_name = scrapy.Field()
        shoes_price = scrapy.Field()
        shoes_picture = scrapy.Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class JdPipeline:
        def open_spider(self, spider):
            print('连接数据库')
            self.db = pymysql.connect(
                user='root', password='******',database='spider2003'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('关闭连接')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            sql = 'insert into jd (shoes_name, shoes_price, shoes_picture) values ("%s", "%s", "%s")' % (item['shoes_name'], item['shoes_price'], item['shoes_picture'])
            try:
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'jd'
    SPIDER_MODULES = ['jd.spiders']
    NEWSPIDER_MODULE = 'jd.spiders'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'referer': 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=3&s=53&click=0'
    }
    ITEM_PIPELINES = {
       'jd.pipelines.JdPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    # scrapy.cmdline.execute('scrapy crawl jd_spider --nolog'.split())
    scrapy.cmdline.execute('scrapy crawl jd_spider'.split())
    

    Scrapy爬取糗事百科

    • qsbk_spider.py
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from ..items import QiushibaikeItem
    # 导入日志模块
    import logging
    # 配置日志输出格式
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # 设置输出格式
    DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # 设置时间格式
    logging.basicConfig(filename='qsbk.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
    
    
    class QsbkSpiderSpider(CrawlSpider):
    # class QsbkSpiderSpider(scrapy.Spider):
        name = 'qsbk_spider'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/text/page/1/']
        rules = [
            Rule(
                LinkExtractor(
                    allow=(r'/text/page/\d+/',),
                    restrict_xpaths=('//ul[@class="pagination"]',)
                ),
                callback="parse_item",
                follow=True
            )
        ]
        def parse_item(self, response, **kwargs):
            div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
            for div in div_list:
                author = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()').get()
                content = div.xpath('./a[@class="contentHerf"]/div/span/text()').getall()  # 有br换行时,要用getall,但是要处理结果
                logging.info(f'download:{author}')
                yield QiushibaikeItem(author=author, content=content)
    
    • items.py
    import scrapy
    
    
    class QiushibaikeItem(scrapy.Item):
        author = scrapy.Field()
        content = scrapy.Field()
    
    • pipelines.py
    import os
    import random
    from itemadapter import ItemAdapter
    
    
    class QiushibaikePipeline:
        def process_item(self, item, spider):
            with open('cross_talk/%s-%f.txt' % (item['author'].replace('\n', ''), random.random()), 'w', encoding='utf-8') as fp:
                fp.write((''.join(item['content'])).replace('\n', ''))
                fp.flush()
            return item
    
    • settings.py
    BOT_NAME = 'qiushibaike'
    SPIDER_MODULES = ['qiushibaike.spiders']
    NEWSPIDER_MODULE = 'qiushibaike.spiders'
    USER_AGENT = 'qiushibaike (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'qiushibaike.pipelines.QiushibaikePipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    # scrapy.cmdline.execute('scrapy crawl qsbk_spider --nolog'.split())
    scrapy.cmdline.execute('scrapy crawl qsbk_spider'.split())
    

    Scrapy爬取新浪新闻

    • news_spider.py
    import scrapy
    from ..items import SinaNewsItem
    
    
    class NewsSpiderSpider(scrapy.Spider):
        name = 'news_spider'
        allowed_domains = ['sina.com.cn']
        start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
        # 自定义类属性
        page = 1
        def parse(self, response, **kwargs):
            li_list = response.xpath('//ul[@class="list_009"]/li')
            for li in li_list:
                news = li.xpath('./a/text()').get()
                news_time = li.xpath('./span/text()').get()
                news_link = li.xpath('./a/@href').get()
    
                item = SinaNewsItem(
                    news=news,
                    news_time=news_time,
                    news_link=news_link,
                )
                yield item
            # 爬取下一页
            if self.page <= 100:
                print(f'---开始爬取{self.page}页---')
                self.page = self.page + 1
                url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
                yield scrapy.Request(url, callback=self.parse)
    
    • items.py
    import scrapy
    
    
    class SinaNewsItem(scrapy.Item):
        news = scrapy.Field()
        news_time = scrapy.Field()
        news_link = scrapy.Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class SinaNewsPipeline:
        def open_spider(self, spider):
            print('开始爬取')
            self.db = pymysql.connect(
                host='localhost',
                port=3306,
                user='root',
                password='******',
                database='spider2003',
                charset='utf8'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('爬取结束')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            news = item['news']
            news_time = item['news_time']
            news_link = item['news_link']
            try:
                sql = 'insert into sina_news(news, news_time, news_link) values ("%s", "%s", "%s")' % (news, news_time, news_link)
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'sina_news'
    SPIDER_MODULES = ['sina_news.spiders']
    NEWSPIDER_MODULE = 'sina_news.spiders'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'sina_news.pipelines.SinaNewsPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl news_spider'.split())
    

    Scrapy高级

    • 日志logging

      • Scrapy提供的log功能

        • 可以修改配置文件settings.py,任意位置添加下面几行,效果会清爽很多

          • LOG_ENABLED = True  # 开启
            LOG_FILE = "mySpider.log" #日志文件名
            LOG_LEVEL = "INFO" #日志级别
            
        • Log levels

          • Scrapy提供5层logging级别
            • CRITICAL - 严重错误(critical)
            • ERROR - 一般错误(regular errors)
            • WARNING - 警告信息(warning messages)
            • INFO - 一般信息(informational messages)
            • DEBUG - 调试信息(debugging messages)
        • logging设置

          • 通过在setting.py中进行以下设置可以被用来配置logging
            • LOG_ENABLED
              • 默认: True,启用logging
            • LOG_ENCODING
              • 默认: 'utf-8',logging使用的编码
            • LOG_FILE
              • 默认: None,在当前目录里创建logging输出文件的文件名
            • LOG_LEVEL
              • 默认: 'DEBUG',log的最低级别
          • scrapy自带的scrapy.log模块已被弃用,推荐直接使用Python标准库logging
      • 使用python自带日志模块

        • import logging
          LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # 设置输出格式
          DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # 设置时间格式
          logging.basicConfig(filename='sina.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
          logging.warning('错误')
          
    • settings配置

      • 概述

        • Scrapy设置(settings)提供了定制Scrapy组件的方法。可以控制包括核心(core),插件(extension),pipeline及spider组件
      • 设置

        • BOT_NAME

          • 默认: 'scrapybot'
          • Scrapy项目实现的bot的名字(也为项目名称)。 这将用来构造默认 User-Agent,同时也用来log
          • 当您使用startproject命令创建项目时其也被自动赋值
        • CONCURRENT_ITEMS

          • 默认: 100
          • Item Processor(即 Item Pipeline)对每个response同时处理item的最大数量
        • CONCURRENT_REQUESTS

          • 默认: 16
          • Scrapy downloader 并发请求(concurrent requests)的最大值
        • CONCURRENT_REQUESTS_PER_DOMAIN

          • 默认: 8
          • 对单个网站进行并发请求的最大值
        • CONCURRENT_REQUESTS_PER_IP

          • 默认: 0
          • 对单个IP进行并发请求的最大值
          • 如果非0,则忽略CONCURRENT_REQUESTS_PER_DOMAIN设定,使用该设定
        • DEFAULT_REQUEST_HEADERS

          • DEFAULT_REQUEST_HEADERS = {
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en',
            }
            
          • Scrapy HTTP Request使用的默认header,由DefaultHeadersMiddleware产生

        • DEPTH_LIMIT

          • 默认: 0
          • 爬取网站最大允许的深度(depth)值。如果为0,则没有限制
        • DOWNLOADER

          • 默认: 'scrapy.core.downloader.Downloader'
          • 用于crawl的downloader
        • DOWNLOADER_MIDDLEWARES

          • 默认:{}
          • 保存项目中启用的下载中间件及其顺序的字典
        • DOWNLOADER_MIDDLEWARES_BASE

          • 默认{...}
          • 包含Scrapy默认启用的下载中间件的字典。 永远不要在项目中修改该设定
        • DOWNLOAD_DELAY

          • 默认: 0
          • 下载器在下载同一个网站下一个页面前需要等待的时间。该选项可以用来限制爬取速度,减轻服务器压力。同时也支持小数
        • DOWNLOAD_TIMEOUT

          • 默认: 180
          • 下载器超时时间(单位: 秒)
        • ITEM_PIPELINES

          • 默认: {}
          • 保存项目中启用的pipeline及其顺序的字典。该字典默认为空,值(value)任意。 不过值(value)习惯设定在0-1000范围内,越小,优先级越高
        • ITEM_PIPELINES_BASE

          • 默认: {}
          • 保存项目中默认启用的pipeline的字典。 永远不要在项目中修改该设定,而是修改ITEM_PIPELINES
        • LOG_ENABLED

          • 默认:True
          • 是否启用logging
        • LOG_ENCODING

          • 默认: 'utf-8'
          • logging使用的编码
        • LOG_FILE

          • 默认: None
          • logging输出的文件名。如果为None,则使用标准错误输出(standard error)
        • LOG_LEVEL

          • 默认: 'DEBUG'
          • 可选的级别有: CRITICAL、 ERROR、WARNING、INFO、DEBUG
        • LOG_STDOUT

          • 默认: False
          • 如果为 True ,进程所有的标准输出(及错误)将会被重定向到log中。例如,执行 print('hello') ,其将会在Scrapy log中显示
        • REDIRECT_MAX_TIMES

          • 默认: 20
          • 定义request允许重定向的最大次数。超过该限制后该request直接返回获取到的结果。 对某些任务我们使用Firefox默认值
        • ROBOTSTXT_OBEY

          • 默认:True
          • 如果启用,Scrapy将会遵守 robots.txt策略
        • SCHEDULER

          • 默认: 'scrapy.core.scheduler.Scheduler'
          • 用于crawl的调度器
        • SPIDER_MIDDLEWARES_BASE

          • 默认:{...}
          • 保存项目中默认启用的spider中间件的字典。 永远不要在项目中修改该设定,而是修改SPIDER_MIDDLEWARES
        • SPIDER_MODULES

          • 默认: []
          • Scrapy搜索spider的模块列表
        • URLLENGTH_LIMIT

          • 默认: 2083
          • 爬取URL的最大长度
        • USER_AGENT

          • 默认: "Scrapy/VERSION (+http://scrapy.org)"
          • 爬取的默认User-Agent,除非被覆盖
        • REACTOR_THREADPOOL_MAXSIZE

          • 线程池数量,默认10条
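      • 上述常用设置在settings.py中的组合示例(取值仅为示意,按需调整)

      • BOT_NAME = 'myspider'
        ROBOTSTXT_OBEY = False              # 不遵守robots.txt
        CONCURRENT_REQUESTS = 16            # 下载器最大并发请求数
        CONCURRENT_REQUESTS_PER_DOMAIN = 8  # 单个网站的最大并发请求数
        DOWNLOAD_DELAY = 1                  # 同一网站两次下载之间的等待秒数,支持小数
        DOWNLOAD_TIMEOUT = 180              # 下载超时时间(秒)
        DEPTH_LIMIT = 3                     # 最大爬取深度,0表示不限制
        LOG_LEVEL = 'INFO'                  # 日志级别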
    • 自定义中间件

      • 中间件种类

        • process_request(self, request, spider)

          • 当每个request通过下载中间件时,该方法被调用
        • process_response(self, request, response, spider)

          • 当下载器完成http请求,传递响应给引擎的时候调用
        • 自定义

          • 创建中间件类

          • # 随机的User-Agent(USER_AGENTS列表配置在settings.py中,见下文)
            import random
            from baidu.settings import USER_AGENTS

            class RandomUserAgent(object):
                def process_request(self, request, spider):
                    useragent = random.choice(USER_AGENTS)
                    request.headers.setdefault("User-Agent", useragent)
            
          • # 随机代理IP(PROXIES列表配置在settings.py中,见下文)
            import random
            from baidu.settings import PROXIES

            class RandomProxy(object):
                def process_request(self, request, spider):
                    proxy = random.choice(PROXIES)
                    request.meta['proxy'] = "http://" + proxy['ip_port']
            
          • 配置中间件

            • 最后设置setting.py里的DOWNLOADER_MIDDLEWARES,添加自己编写的下载中间件类

            • DOWNLOADER_MIDDLEWARES = {
                 'baidu.middlewares.BaiduDownloaderMiddleware': 543,
                 # 配置中间件
                 'baidu.middlewares.UADownloaderMiddleware': 300,
                 'baidu.middlewares.ProxyDownloaderMiddleware': 200,
              }
              
              USER_AGENTS = [
                  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
              ]
              
              PROXIES = [
                  {'ip_port': '58.218.200.214:8730'},
                  {'ip_port': '58.218.200.247:2359'},
                  {'ip_port': '58.218.200.248:8503'},
                  {'ip_port': '58.218.200.229:4612'},
                  {'ip_port': '58.218.200.214:5570'},
                  {'ip_port': '58.218.200.214:8801'},
              ]
              
    • POST requests

      • If the first request is a POST

        • Comment out the start_urls attribute and override the start_requests method (a fuller runnable sketch follows at the end of this section)

        • def start_requests(self):
              yield scrapy.FormRequest(
                      url='http://fanyi.baidu.com/sug',
                      formdata={'kw': 'wolf'},
                      callback=self.parse_item
                  )
          
      • If the POST is not the first request

        • # With the plain requests library the equivalent would be: requests.post(url, data=data, headers=headers)
          # In Scrapy, yield a FormRequest from the parse callback instead:
          yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_item)
          
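      • A fuller, self-contained sketch of the first case. The 'data'/'k'/'v' layout of the sug endpoint's JSON response is an assumption, and response.json() requires Scrapy >= 2.2:

        • import scrapy


          class FanyiSpider(scrapy.Spider):
              name = 'fanyi'
              # start_urls is deliberately omitted; the first request is a POST

              def start_requests(self):
                  yield scrapy.FormRequest(
                      url='http://fanyi.baidu.com/sug',
                      formdata={'kw': 'wolf'},
                      callback=self.parse_item
                  )

              def parse_item(self, response):
                  result = response.json()
                  # each suggestion is assumed to look like {'k': ..., 'v': ...}
                  for entry in result.get('data', []):
                      yield {'keyword': entry.get('k'), 'value': entry.get('v')}
          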

    Scraping Xinpianchang (a complete example)

    • xpc_spider.py
    import scrapy
    from ..items import *
    
    
    class XpcSpiderSpider(scrapy.Spider):
        name = 'xpc_spider'
        allowed_domains = ['xinpianchang.com']
        start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']

        def parse(self, response, **kwargs):
            # Parse the response
            # Video list entries
            li_list = response.xpath('//ul[@class="video-list"][1]/li')
            for li in li_list:
                # Post id
                pid = li.xpath('./@data-articleid').get()
                # Post title
                title = li.xpath('./div/div[1]/a/p/text()').get()
                # Thumbnail
                thumbnail = li.xpath('./a/img/@_src').get()
                category_list = li.xpath('.//div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').getall()
                # Category
                category = '|'.join(category_list)
                category = category.replace(' ', '').replace('\n', '').replace('\t', '')
                # Publish time
                created_at = li.xpath('.//p[@class="fs_12"]/text()').get()
                # Build the item
                item = PostsItem()
                item['pid'] = pid
                item['title'] = title
                item['thumbnail'] = thumbnail
                item['category'] = category
                item['created_at'] = created_at
                # Follow the post detail page, carrying the item in request.meta
                post_url = f'https://www.xinpianchang.com/a{pid}?from=ArticleList'
                request = scrapy.Request(url=post_url, callback=self.post_detail)
                request.meta['post_item'] = item
                yield request

        # Post detail page
        def post_detail(self, response):
            post_item = response.meta.get('post_item')
            pid = post_item['pid']
            # Parse the response
            # Post description
            description_list = response.xpath('//p[@class="desc line-hide fs_14 c_b_3 fw_300 line-hide-3"]/text()').getall()
            description = ''.join(description_list)
            description = description.replace(' ', '').replace('\n', '').replace('\t', '')
            post_item['description'] = description
            # Play count
            play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
            post_item['play_counts'] = play_counts
            # Like count
            like_counts = response.xpath('//span[@class="v-center like-counts fs_12 c_w_f fw_300"]/@data-counts').get()
            post_item['like_counts'] = like_counts
            # Video data
            # video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/ryM1l4365Wzwod2V?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
            vid = response.xpath('//a[@class="collection-star hollow-star"]/@data-vid').get()
            video_url = f'https://mod-api.xinpianchang.com/mod/api/v2/media/{vid}?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
            # Request the video data
            request = scrapy.Request(url=video_url, callback=self.video_detail)
            request.meta['post_item'] = post_item
            yield request
            # Creator data
            li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul/li')
            for li in li_list:
                # Creator id
                cid = li.xpath('./a/@data-userid').get()
                # Build the creator item
                composer_item = ComposersItem()
                composer_item['cid'] = cid
                # Creator url
                composer_url = li.xpath('./a/@href').get()
                composer_url = 'https://www.xinpianchang.com/' + composer_url
                # Follow the creator detail page
                request2 = scrapy.Request(url=composer_url, callback=self.composer_detail)
                request2.meta['composer_item'] = composer_item
                yield request2
                # Copyright/role data
                cr_item = CopyrightsItem()
                cr_item['pcid'] = f'{pid}_{cid}'
                cr_item['pid'] = pid
                cr_item['cid'] = cid
                cr_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
                yield cr_item
            # Comment data
            comment_url = f'https://app.xinpianchang.com/comments?resource_id={pid}&type=article&page=1&per_page=24'
            yield scrapy.Request(
                url=comment_url,
                callback=self.comment_detail
            )

        # Video data
        def video_detail(self, response):
            post_item = response.meta.get('post_item')
            # Parse the JSON response
            content = response.json()
            # Video preview image
            preview = content['data']['cover']
            # Video url
            video = content['data']['resource']['progressive'][0]['url']
            # Video format
            video_format = content['data']['resource']['progressive'][0]['mime']
            # Video duration
            duration = content['data']['duration']
            # Fill in the item
            post_item['preview'] = preview
            post_item['video'] = video
            post_item['video_format'] = video_format
            post_item['duration'] = duration
            # print(post_item)
            yield post_item

        # Creator detail page
        def composer_detail(self, response):
            composer_item = response.meta.get('composer_item')
            # Banner image
            banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
            banner = banner[banner.find('(')+1: -1]
            # User avatar
            avatar = response.xpath('//div[@class="banner-wrap"]/div/span/img/@src').get()
            # Whether the account is verified (has a "V" badge)
            verified = response.xpath('//div[@class="banner-wrap"]/div/span/span[contains(@class, "author-v")]').get()
            verified = 'yes' if verified else 'no'
            # Name
            name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
            # Self-introduction
            intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
            # Number of likes received
            like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
            like_counts = like_counts.replace(',', '')
            # Number of followers
            fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
            fans_counts = fans_counts.replace(',', '')
            # Number of accounts followed
            follow_counts = response.xpath('//span[@class="follow-wrap"]/span[@class="fw_600 v-center"]/text()').get()
            follow_counts = follow_counts.replace(',', '')
            # Location
            location = response.xpath('//span[@class="icon-location v-center"]/following-sibling::*/text()').get()
            location = location if location else ''
            # Occupation
            career = response.xpath('//span[@class="icon-career v-center"]/following-sibling::*/text()').get()
            career = career if career else ''
            # Fill in the item
            composer_item['banner'] = banner
            composer_item['avatar'] = avatar
            composer_item['verified'] = verified
            composer_item['name'] = name
            composer_item['intro'] = intro
            composer_item['like_counts'] = like_counts
            composer_item['fans_counts'] = fans_counts
            composer_item['follow_counts'] = follow_counts
            composer_item['location'] = location
            composer_item['career'] = career
            yield composer_item

        # Comment data
        def comment_detail(self, response):
            content = response.json()
            comment_list = content['data']['list']
            for comment in comment_list:
                # Id of the comment this one replies to (0 if it is not a reply)
                reply = comment.get('referer')
                if reply:
                    reply = reply.get('id')
                else:
                    reply = 0
                item = CommentsItem(
                    commentid=comment['id'],
                    pid=comment['resource_id'],
                    cid=comment['userid'],
                    avatar=comment['userInfo']['avatar'],
                    uname=comment['userInfo']['username'],
                    created_at=comment['addtime'],
                    content=comment['content'],
                    like_counts=comment['count_approve'],
                    reply=reply
                )
                yield item
    
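    • While developing, the spider above can be throttled without editing settings.py by giving the class a custom_settings dict (a sketch; the numbers are arbitrary, and CLOSESPIDER_ITEMCOUNT relies on Scrapy's built-in CloseSpider extension):

    class XpcSpiderSpider(scrapy.Spider):
        name = 'xpc_spider'
        custom_settings = {
            'CLOSESPIDER_ITEMCOUNT': 50,  # stop the crawl after roughly 50 items
            'DOWNLOAD_DELAY': 1,          # be gentler with the site while testing
        }
        # ... the rest of the spider stays unchanged ...
    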
    • items.py
    from scrapy import Item, Field
    
    
    # Posts (works)
    class PostsItem(Item):
        table_name = 'posts'  # table name
        pid = Field()
        title = Field()
        thumbnail = Field()
        preview = Field()
        video = Field()
        video_format = Field()
        category = Field()
        duration = Field()
        created_at = Field()
        description = Field()
        play_counts = Field()
        like_counts = Field()


    # Creators
    class ComposersItem(Item):
        table_name = 'composers'  # table name
        cid = Field()
        banner = Field()
        avatar = Field()
        verified = Field()
        name = Field()
        intro = Field()
        like_counts = Field()
        fans_counts = Field()
        follow_counts = Field()
        location = Field()
        career = Field()


    # Comments
    class CommentsItem(Item):
        table_name = 'comments'  # table name
        commentid = Field()
        pid = Field()
        cid = Field()
        avatar = Field()
        uname = Field()
        created_at = Field()
        content = Field()
        like_counts = Field()
        reply = Field()


    # Copyright: the creator's role(s) in a post
    class CopyrightsItem(Item):
        table_name = 'copyrights'  # table name
        pcid = Field()
        pid = Field()
        cid = Field()
        roles = Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class XpcPipeline:
        def open_spider(self, spider):
            print('--- Start writing to MySQL ---')
            self.db = pymysql.connect(user='root', password='nzw19940611', database='xpc_2020')
            self.cur = self.db.cursor()

        def close_spider(self, spider):
            print('--- Finished writing to MySQL ---')
            self.cur.close()
            self.db.close()

        def process_item(self, item, spider):
            # Table name (declared on each Item class)
            table_name = item.table_name
            keys = list(item.keys())
            values = list(item.values())
            # Comma-separated string of all column names
            key_str = ','.join(["`%s`" % key for key in keys])
            # Comma-separated string of value placeholders
            # value_str = ','.join(['"%s"' % value for value in values])
            value_str = ','.join(["%s"] * len(values))
            # If the key conflicts, update the existing row with the new data
            update_str = ','.join(["`{}`=%s".format(key) for key in keys])
            # Build the sql
            sql = 'insert into `{}` ({}) values ({}) on duplicate key update {}'.format(
                table_name,
                key_str,
                value_str,
                update_str
            )
            # Execute the sql (values are bound twice: once for INSERT, once for UPDATE)
            self.cur.execute(sql, values*2)
            self.db.commit()
            print(f'--- Inserted into {table_name} ---')
            return item
    
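    • The upsert SQL that process_item builds can be previewed standalone; a small sketch with a hypothetical two-field item (not part of the project):

    item = {'pid': '10727310', 'title': 'demo'}  # stand-in for a PostsItem
    table_name = 'posts'
    keys, values = list(item.keys()), list(item.values())
    key_str = ','.join("`%s`" % key for key in keys)
    value_str = ','.join(["%s"] * len(values))
    update_str = ','.join("`{}`=%s".format(key) for key in keys)
    sql = 'insert into `{}` ({}) values ({}) on duplicate key update {}'.format(
        table_name, key_str, value_str, update_str)
    print(sql)
    # insert into `posts` (`pid`,`title`) values (%s,%s) on duplicate key update `pid`=%s,`title`=%s
    print(values * 2)  # the parameter list passed to cursor.execute
    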
    • settings.py
    BOT_NAME = 'xpc'
    SPIDER_MODULES = ['xpc.spiders']
    NEWSPIDER_MODULE = 'xpc.spiders'
    USER_AGENT = 'xpc (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'xpc.pipelines.XpcPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl xpc_spider --nolog'.split())
    # scrapy.cmdline.execute('scrapy crawl xpc_spider'.split())
    
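    • start.py can pass extra crawl options through the same command string; for example (a sketch, the output file name is arbitrary):

    import scrapy.cmdline


    # -o uses Scrapy's feed export to also dump the yielded items to a JSON file;
    # the MySQL pipeline configured in settings.py still runs
    scrapy.cmdline.execute('scrapy crawl xpc_spider -o items.json'.split())
    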