  • 1-由浅入深学爬虫

    爬虫

    爬虫入门

    urllib

    from urllib import request
    
    
    url = 'http://www.baidu.com'
    # User-Agent: 模拟浏览器,防止服务器反爬
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 使用request发送请求
    # 创建请求对象
    req = request.Request(url=url, headers=headers)
    # 发送请求
    response = request.urlopen(req)
    # 响应数据
    # print(response.read())  # 二进制
    print(response.read().decode())  # 解码,得到字符串
    # print(response.info())  # 响应信息
    # print(response.status)  # 状态码
    

    urllib模拟百度搜索

    from urllib import request
    from urllib import parse
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    
    def baidu_search(params, key):
        # 百度搜索url
        url = f'https://www.baidu.com/s?{params}'
        # 发送请求
        req = request.Request(url, headers=headers)
        res = request.urlopen(req)
        content = res.read().decode()
        print(content)
        # 保存爬取的数据
        with open(f'{key}.html', 'w', encoding='utf-8') as fp:
            fp.write(content)
            fp.flush()
    '''
    如果向服务器发送数据,那么data参数必须是一个有数据的bytes对象,否则为None。HTTP请求使用POST方法时,data必须有数据;使用GET方法时,data写成None
    data = parse.urlencode({"pro": "value"}).encode("utf-8")
    response = request.urlopen("http://www.baidu.com", data=data)
    '''
    
    
    if __name__ == '__main__':
        key = input('请输入要搜索的内容')
        params = {'wd': key}
        params = parse.urlencode(params)  # 解决url中出现中文的问题
        # print(params)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
        baidu_search(params, key)
    

    urllib爬取51job

    import re
    from urllib import request
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # url
    url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # 发送请求
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    # 获取数据
    content = res.read().decode('gbk')
    # 使用正则
    pattern = '"jobid_count":"(.*?)"'  # 捕获
    result = re.findall(pattern, content, re.S)  # 让.可以匹配换行
    print(result)
    

    urllib下载图片

    from urllib import request
    
    
    # 下载图片
    request.urlretrieve(
        url='https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1600067380374&di=16addb0b6e336ab847a1403cebc09a43&imgtype=0&src=http%3A%2F%2Fgss0.baidu.com%2F-vo3dSag_xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%2Fb17eca8065380cd72cbb313da744ad34588281bd.jpg',
        filename='人民币.jpg'  # 与图片实际格式(jpg)保持一致
    )
    request.urlcleanup()  # 清理缓存
    

    urllib爬取豆瓣电影

    import json
    from urllib import request
    import pymysql
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0'
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    # json解析:json反序列化
    # json一定要用双引号
    # 不能在json中注释
    content = res.read().decode()
    result = json.loads(content)
    '''
    # 1.将电影数据存入本地txt文件
    movie_list = result['data']
    for movie in movie_list:
        title = movie['title']
        url = movie['url']
        with open('douban.txt', 'a', encoding='utf-8') as fp:
            s = str((title, url)) + '\n'
            fp.write(s)
            fp.flush()
    '''
    # 2.将电影数据存储到MySQL
    # 连接MySQL
    db = pymysql.connect(
        host='localhost', port=3306,
        user='root', password='nzw19940611',
        database='spider2003', charset='utf8mb4'
    )
    cur = db.cursor()  # 游标:执行SQL
    # 执行SQL
    movie_list = result['data']
    for movie in movie_list:
        title = movie['title']
        url = movie['url']
        try:
            # sql
            sql = 'insert into tb_douban_movie(movie_title, url) values("%s", "%s")' % (title, url)
            cur.execute(sql)
            db.commit()  # 事务提交
        except Exception as e:
            print('插入失败:', e)
            db.rollback()  # 回滚
    print('--插入MySQL完成--')
    # content = eval(res.read().decode())
    # for i in range(len(content['data'])):
    #     with open('豆瓣.txt', 'a', encoding='utf-8') as fp:
    #         fp.write(content['data'][i]['title'] + '\n')
    #         fp.flush()
    

    urllib使用代理IP

    import random
    from urllib import request
    import json
    
    
    # 先获取芝麻代理ip
    url = 'http://http.tiqu.alicdns.com/getip3?num=10&type=2&pro=0&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
    # 请求芝麻代理API
    response = request.urlopen(url)
    content = response.read().decode()
    # print(content)
    # json解析,提取ip和port
    result = json.loads(content)
    ip_list = result['data']
    # 把ip格式化后存入proxy_list
    proxy_list = []
    for ip in ip_list:
        ip_dict = {
            'http': f'http://{ip["ip"]}:{ip["port"]}'
        }
        proxy_list.append(ip_dict)
    # print(proxy_list)  # {'http': 'http://58.218.92.13:6905'}......
    # url = UserAgent池
    UserAgentList = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
        "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36"
    ]
    # 获取随机的代理IP
    proxy = random.choice(proxy_list)
    # 随机的UA
    ua = random.choice(UserAgentList)
    # 使用代理IP和UA
    proxy_handler = request.ProxyHandler(proxies=proxy)  # 构建一个代理
    opener = request.build_opener(proxy_handler)  # 使用构建的代理创建一个opener对象
    # 发送请求
    req = request.Request('http://www.baidu.com')
    req.add_header('User-Agent', ua)  # 随机的ua
    # 使用带代理的opener对象打开某个url/request
    response = opener.open(req)  # 等价于request.urlopen()
    res = response.read().decode()
    print(res)
    

    requests基础

    import requests
    
    
    # get请求
    '''
    response = requests.get('http://www.baidu.com')
    # print(response)  # <Response [200]>
    print(response.text)  # 默认使用utf-8解码,内容字符串
    print(response.content)  # 二进制
    # print(response.json())  # json解析
    
    # print(response.headers)  # 头部信息
    # print(response.cookies)  # 响应的cookie
    # print(response.status_code)  # 状态码
    '''
    
    '''
    # get请求:百度搜索
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://www.baidu.com/s?wd=hello'
    response = requests.get(url, headers=headers)
    print(response.text)
    '''
    
    # post请求:有道翻译
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    kw = input('请输入要翻译的单词:')
    # data是post的参数
    data = {
        "i": kw,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": "16000738465941",
        "sign": "bf2e220fb6fe0ec8e03524a390dc0b5c",
        "lts": "1600073846594",
        "bv": "e915c77f633538e8cf44c657fe201ebb",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CLICKBUTTION"
    }
    response = requests.post(url, data=data, headers=headers)
    result = response.json()  # json解析,解析成字典
    src = result['translateResult'][0][0]['src']
    tgt = result['translateResult'][0][0]['tgt']
    print(src, tgt)
    

    bs4和xpath

    requests使用代理

    import random
    import requests
    
    
    '''
    58.218.200.228:9150
    58.218.200.223:4432
    58.218.200.226:8256
    58.218.200.228:7837
    58.218.200.223:8915
    '''
    # proxy
    proxy_list = [
        {"http": "http://58.218.200.228:9150"},
        {"http": "http://58.218.200.223:4432"},
        {"http": "http://58.218.200.226:8256"},
        {"http": "http://58.218.200.228:7837"},
        {"http": "http://58.218.200.223:8915"}
    ]
    # 获取随机代理IP
    proxy = random.choice(proxy_list)
    # 使用代理
    res = requests.get('http://www.baidu.com', proxies=proxy)
    print(res.text)
    

    requests使用session

    import requests
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    url = 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0'
    # 使用session
    session = requests.session()
    # 使用session发送请求:保持会话,存储cookie
    response = session.get(url, headers=headers)
    print(response.text)
    # 当继续使用session访问其他url时,会自动携带之前的cookie
    url2 = 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=2&s=27&scrolling=y&log_id=1600483717480.6970&tpl=3_M&isList=1&show_items='
    response2 = session.get(url2, headers=headers)
    print(response2.text)
    

    requests使用cookies

    import requests
    
    
    url = 'http://www.baidu.com'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    cookies = {
        "PSTM": "1600136817",
        "BDRCVFR[feWj1Vr5u3D]": "mk3SLVN4HKm",
        "BAIDUID": " E922D90277D06E37B8B783C0082C650A:FG=1",
        "delPer": "0",
        "BD_CK_SAM": "1",
        "PSINO": "6",
        "H_PS_PSSID": "7506_32606_1424_7605_32116_31709_26350",
        "BIDUPSID": "89E6649E57A3DC9DABE613D88595BA0D",
        "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
        "BD_UPN": "12314753",
        "COOKIE_SESSION": "16_0_2_5_3_11_0_0_0_2_0_0_67596_0_0_0_1600136510_0_1600136818%7C5%230_0_1600136818%7C1",
        "H_PS_645EC": "3fcbYEWAxGp5VGowaCXsud%2BK436DuYp%2Bu6fs%2FUwAz9UFcCyuSSHqbS7CSMLQBpsMjeN%2F"
    }
    response = requests.get(url, headers=headers, cookies=cookies)
    # print(response.text)
    # print(response.cookies)
    # 将服务器返回的cookiejar,转换成字典dict
    cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
    print(cookie_dict)
    

    bs4基本用法

    from bs4 import BeautifulSoup
    # 安装BeautifulSoup4
    # pip install beautifulsoup4
    # 安装HTML解析器lxml
    
    
    html_doc = """
    <html>
        <head>
            <title>呵呵</title>
        </head>
        <body>
        <p class="title">
            <b>哈哈</b>
        </p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="first" class="sister" id="link1">first</a>,
        <a href="second" class="sister" id="link2">second</a> and
        <a href="third" class="sister" id="link3">third</a>;
        </p>
        <p class="story">end</p>
        </body>
    </html>
    """
    # 使用bs4
    # 创建bs4对象
    soup = BeautifulSoup(html_doc, 'lxml')
    # print(soup)
    # print(type(soup))
    # tag标签
    # print(soup.head)
    # print(type(soup.head))  # <class 'bs4.element.Tag'>
    # print(soup.title)  # title标签
    # print(soup.b)  # 哈哈
    # print(soup.body.p.b)
    # attribute属性
    # print(soup.p.attrs)  # {'class': ['title']}第一个p所有属性
    # print(soup.a.attrs)  # {'href': 'first', 'class': ['sister'], 'id': 'link1'}第一个a的所有属性
    # print(soup.a.attrs['href'])  # 获取某个属性值
    # 文本内容,建议使用text
    # print(soup.b.string)  # 哈哈
    # print(soup.b.text)  # 哈哈
    # print(soup.p.string)  # None
    # print(soup.p.text)  # 哈哈
    # find_all():找到所有匹配的节点
    # print(soup.find_all('p'))  # 所有p节点
    # print(soup.find_all('p')[2])
    # 根据属性来查找
    # print(soup.find_all('p', attrs={'class': 'story'}))
    # print(soup.find_all('a', attrs={'id': 'link1'}))
    # print(soup.find_all('a', id='link1'))
    # print(soup.find_all('a', limit=2))  # 前两个a标签
    # print(soup.find_all(['a', 'b']))  # 找所有a标签和b标签
    # css选择器
    # soup.select()
    # print(soup.select('p'))  # 标签选择器
    # print(soup.select('#link2'))  # id选择器
    # print(soup.select('.sister'))  # class选择器
    # print(soup.select('p #link3'))  # 后代选择器
    # 从文档中获取所有文字内容
    print(soup.get_text())
    

    bs4解析股票基金数据

    import requests
    from bs4 import BeautifulSoup
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 股票网址
    url = 'http://quote.stockstar.com/fund/stock.shtml'
    response = requests.get(url, headers=headers)
    content = response.content.decode('gb2312')
    # print(content)
    # bs4解析网页内容
    soup = BeautifulSoup(content, 'lxml')
    tr_list = soup.select('#datalist tr')
    # print(tr_list)
    for tr in tr_list:
        s_code = tr.find_all('td')[0].a.text  # 基金代码
        s_name = tr.find_all('td')[1].a.text  # 基金名称
        s_unit = tr.find_all('td')[2].text  # 单位
        s = str((s_code, s_name, s_unit)) + '
    '
        with open('fund.txt', 'a', encoding='utf-8') as fp:
            fp.write(s)
            fp.flush()
    

    xpath的基本使用

    # xpath需要安装lxml
    # pip install lxml
    from lxml import etree
    
    
    html_doc = """
    <html>
        <head>
            <title>呵呵</title>
        </head>
        <body>
            <ul>
                <li class="item" id="box1">
                    <a href="aabb">打仗1</a>
                </li>
                <li class="item" id="box2">
                    <a href="aabbcc">打仗2</a>
                </li>
                <li class="item" id="box3">
                    <a href="bbccdd">打仗3</a>
                </li>
                <li class="item" id="box4">
                    <a href="ddee">打仗4</a>
                </li>
            </ul>
            <p class="item">
                <a href="aabb">打仗5</a>
            </p>
        </body>
    </html>
    """
    # 使用xpath
    # 创建etree对象
    mytree = etree.HTML(html_doc)
    # print(mytree)  # <Element html at 0x1feda822e08>
    # print(type(mytree))  # <class 'lxml.etree._Element'>
    # /:子节点
    # //:后代节点
    # print(mytree.xpath('/html'))  # html标签
    # print(mytree.xpath('/html/head'))  # head标签
    # print(mytree.xpath('/html/body/ul/li'))  # 所有li标签
    # print(mytree.xpath('//li'))  # 所有li标签
    # print(mytree.xpath('//li')[1])  # 第二个li标签,得到etree对象
    # print(mytree.xpath('//li[2]/@id'))
    # text():文本内容
    # li_list = mytree.xpath('//li')
    # for li in li_list:
    #     # 里面的.表示当前节点,不能省略
    #     content = li.xpath('./a/text()')  # 文本内容
    #     attr = li.xpath('./@id')  # 属性值
    #     print(content, attr)
    # 谓语:加条件
    # 谓词写在[]中
    # print(mytree.xpath('//li[1]/a/text()'))  # ['打仗1']
    # print(mytree.xpath('//li[last()]/a/text()'))  # ['打仗4']
    # print(mytree.xpath('//li[last()-1]/a/text()'))  # ['打仗3'],倒数第二个
    # print(mytree.xpath('//li[position()<3]/a/text()'))  # ['打仗1', '打仗2']
    # print(mytree.xpath('//li[position()>=3]/a/text()'))  # ['打仗3', '打仗4']
    # print(mytree.xpath('//li[@id="box1"]/a/text()'))  # ['打仗1']
    # print(mytree.xpath('//li[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4']
    # *通配符
    # print(mytree.xpath('//*[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
    # |或
    # print(mytree.xpath('//li[@class="item"]/a/text() | //p[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
    # 包含contains()
    # print(mytree.xpath('//li/a[contains(@href, "aa")]/text()'))  # ['打仗1', '打仗2']
    print(mytree.xpath('//li/a[contains(text(), "2")]/text()'))  # ['打仗2']
    

    xpath解析股票基金数据

    import requests
    from lxml import etree
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    }
    # 股票网址
    url = 'http://quote.stockstar.com/fund/stock.shtml'
    response = requests.get(url, headers=headers)
    content = response.content.decode('gb2312')
    # print(content)
    # xpath解析网页内容
    mytree = etree.HTML(content)
    tr_list = mytree.xpath('//tbody[@id="datalist"]/tr')
    for i, tr in enumerate(tr_list):
        f_code = tr.xpath('./td[1]/a/text()')[0]
        f_name = tr.xpath('./td[2]/a/text()')[0]
        f_unit = tr.xpath('./td[3]/text()')[0]
        # csv文件
        with open('fund.csv', 'a', encoding='gb2312') as fp:
            if i == 0:
                fp.write('基金代码,基金名称,单位净值\n')
            f = f'{f_code},{f_name},{f_unit}\n'
            fp.write(f)
            fp.flush()
    

    selenium和验证码破解

    超级鹰破解验证码

    import requests
    from hashlib import md5
    
    
    class Chaojiying_Client(object):
        def __init__(self, username, password, soft_id):
            self.username = username
            password = password.encode('utf8')
            self.password = md5(password).hexdigest()
            self.soft_id = soft_id
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }
        def PostPic(self, im, codetype):
            """
            im: 图片字节
            codetype: 题目类型 参考 http://www.chaojiying.com/price.html
            """
            params = {
                'codetype': codetype,
            }
            params.update(self.base_params)
            files = {'userfile': ('ccc.jpg', im)}
            r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
            return r.json()
        def ReportError(self, im_id):
            """
            im_id:报错题目的图片ID
            """
            params = {
                'id': im_id,
            }
            params.update(self.base_params)
            r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
            return r.json()
    
    
    if __name__ == '__main__':
        chaojiying = Chaojiying_Client('lotuslaw', '******', '908114')
        # 用户中心>>软件ID 生成一个替换 96001
        img = open('../a.jpg', 'rb').read()
        #本地图片文件路径 来替换 a.jpg
        print(chaojiying.PostPic(img, 1902))
        # 1902 验证码类型
    

    selenium的基本用法

    import time
    from selenium import webdriver
    
    # 创建浏览器驱动
    # 可以手动配置驱动的路径
    # 将chromedriver.exe放到python.exe同目录
    from selenium.webdriver.common.keys import Keys
    driver = webdriver.Chrome()
    # 打开浏览器
    driver.get('http://www.baidu.com')
    # 获取网页源码
    # print(driver.page_source)
    # 关闭
    # time.sleep(5)
    # driver.close()  # 只关闭一个窗口
    # driver.quit()  # 退出,关闭所有窗口
    # 百度贴吧
    driver.get('https://tieba.baidu.com/index.html')
    '''
    # 查找元素
    wd1 = driver.find_element_by_id('wd1')
    # wd1.send_keys('美女', Keys.ENTER)  # 给输入框填充内容,自动按回车
    time.sleep(2)
    wd1.send_keys('美女')
    # 点击按钮
    btn = driver.find_element_by_xpath('//a[@class="search_btn search_btn_enter_ba j_enter_ba"]')
    # btn.click()
    # 获取内容和属性值
    print(btn.get_attribute('innerText'))  # innerText, innerHTML
    print(wd1.get_attribute('value'))  # 输入框的值
    '''
    # 执行js
    time.sleep(3)
    # 执行JS脚本
    # driver.execute_script('window.scrollBy(0, 5000)')
    for i in range(5):
        driver.execute_script('window.scrollBy(0,5000)')
        time.sleep(2)
    # 截图
    # driver.save_screenshot('teiba.png')
    

    selenium登录知乎

    import time
    from selenium import webdriver
    
    
    # 知乎登录页面
    url = 'https://www.zhihu.com/signin?next=%2F'
    # 打开知乎页面
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(2)
    # 点击qq
    driver.find_element_by_xpath('//*[@class="Button Login-socialButton Button--plain"][2]').click()
    # 停10秒,手动扫码登录
    time.sleep(10)
    # 刷新页面
    driver.refresh()
    # 获取页面
    print(driver.page_source)
    print(driver.get_cookies())
    

    headless无头浏览器

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    
    # 在正常页面跑通流程后,使用无头浏览器节约资源
    options = ChromeOptions()
    options.add_argument('--headless')  # 无头浏览器
    options.add_argument('--disable-gpu')  # 禁用GPU
    # 创建驱动对象
    driver = webdriver.Chrome(options=options)
    driver.get('http://www.baidu.com')
    print(driver.page_source)
    

    selenium设置代理

    from selenium import webdriver
    
    
    options = webdriver.ChromeOptions()
    # 设置代理IP
    options.add_argument('--proxy-server=http://58.218.200.226:8256')
    # 创建驱动
    driver = webdriver.Chrome(options=options)
    driver.get('http://www.baidu.com')
    print(driver.page_source)
    

    超级鹰人人网验证码破解登录

    from Day03.chaojiying import chaojiying
    import requests
    import random
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    
    def get_code():
        url = 'http://icode.renren.com/getcode.do?t=web_login&rnd=' + str(random.random())
    
        # 获取验证码图片
        res = session.get(url, headers=headers)
        content = res.content  # 图片二进制
    
        # 使用超级鹰破解
        cjy = chaojiying.Chaojiying_Client('lotuslaw', '******', '908114')
        code = cjy.PostPic(content, 1902)
        # print(code)  # 返回的是包含识别结果的字典
        return code['pic_str']  # 取出识别出的验证码字符串
    
    
    def login(code):
        # 登录接口抓取:给一个错误的密码进行登录
        login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020831616448'
    
        login_data = {
            "email": "18566218480",
            "icode": code,
            "origURL": "http://www.renren.com/home",
            "domain": "renren.com",
            "key_id": "1",
            "captcha_type": "web_login",
            "password": "88d7f48bf698c0f1b0dcca94bfb40361c6c82ced70f8cbf0619d725e0341d2e5",
            "rkey": "e8d80414c49ceb424291126858ee6226",
            "f": ''
        }
        # 发送请求
        res = session.post(login_url, data=login_data, headers=headers)
        content = res.text
        print(content)
        
        
    # 登录后访问个人中心
    def user_center():
        url = 'http://www.renren.com/480089210/profile'
        res = session.get(url, headers=headers)
        print(res.text)
    
    
    if __name__ == '__main__':
        session = requests.session()
        code = get_code()
        login(code)
        user_center()
    

    Scrapy基础

    • Scrapy框架介绍

      • Scrapy是用纯Python实现的一个为了爬取网站数据、提取结构性数据而编写的应用框架,用途非常广泛
      • 用户只需要定制开发几个模块就可以轻松的实现一个爬虫,用来抓取网页内容以及各种图片,非常之方便
      • Scrapy 使用了Twisted(其主要对手是Tornado)多线程异步网络框架来处理网络通讯,可以加快我们的下载速度,不用自己去实现异步框架,并且包含了各种中间件接口,可以灵活的完成各种需求
    • Scrapy框架组件

      • Scrapy Engine(引擎)
        • 负责Spider、ItemPipeline、Downloader、Scheduler中间的通讯,信号、数据传递等
      • Scheduler(调度器)
        • 它负责接受引擎发送过来的Request请求,并按照一定的方式进行整理排列,入队,当引擎需要时,交还给引擎
      • Downloader(下载器)
        • 负责下载Scrapy Engine(引擎)发送的所有Requests请求,并将其获取到的Responses交还给Scrapy Engine(引擎),由引擎交给Spider来处理
      • Spider(爬虫)
        • 它负责处理所有Responses,从中分析提取数据,获取Item字段需要的数据,并将需要跟进的URL提交给引擎,再次进入Scheduler(调度器)
      • Item Pipeline(管道)
        • 它负责处理Spider中获取到的Item,并进行后期处理(详细分析、过滤、存储等)的地方
      • Downloader Middlewares(下载中间件)
        • 你可以当作是一个可以自定义扩展下载功能的组件
      • Spider Middlewares(Spider中间件)
        • 你可以理解为是一个可以自定扩展和操作引擎Spider中间通信的功能组件(比如进入Spider的Responses和从Spider出去的Requests)
    • 安装使用

      • 安装

        • pip install scrapy

      • 使用

        • 新建项目

          • 进入项目存放的目录
          • scrapy startproject meiju
        • 创建爬虫程序

          • 进入项目目录
          • scrapy genspider meijuSpider meijutt.tv
            • meijuSpider为爬虫文件名
            • meijutt.tv为爬取网址的域名
          • 工程文件介绍
            • scrapy.cfg
              • 项目的配置信息,主要为Scrapy命令行工具提供一个基础的配置信息。(真正爬虫相关的配置信息在settings.py文件中)
            • items.py
              • 设置数据存储模板,用于结构化数据,如:Django的Model
            • pipelines
              • 数据处理行为,如:一般结构化的数据持久化
            • settings.py
              • 配置文件,如:递归的层数、并发数,延迟下载等
            • spiders
              • 爬虫目录,如:创建文件,编写爬虫规则
        • 定义Item

          • class MeijuItem(scrapy.Item):
                name = scrapy.Field()
            
        • 编写爬虫

          • 修改起始爬取的url

          • 数据处理

          • def parse(self, response):
                item = MeijuItem()
                item['name'] = name
                yield item
            
        • 启用一个Item Pipeline组件,在settings.py开启

          • ITEM_PIPELINES = {
               'meiju.pipelines.MeijuPipeline': 300,
            }
            
        • 编写 Pipeline 来存储提取到的Item(除快速导出外,本小节末尾附一个最小Pipeline示意)

          • 快速存储

            • 在运行文件中存储

            • scrapy crawl meijuSpider -o meiju.json
              scrapy crawl meijuSpider -o meiju.csv
              scrapy crawl meijuSpider -o meiju.xml
              
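          • 自己编写Pipeline持久化(最小示意,文件名meiju.jl仅为举例:把每个item写成一行JSON)

          • import json
            
            class MeijuPipeline:
                def open_spider(self, spider):
                    # 爬虫启动时打开文件
                    self.fp = open('meiju.jl', 'w', encoding='utf-8')
            
                def process_item(self, item, spider):
                    # 每个item序列化成一行JSON写入
                    self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
                    return item
            
                def close_spider(self, spider):
                    # 爬虫结束时关闭文件
                    self.fp.close()
            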
        • 运行爬虫

          • 命令行运行

            • scrapy crawl meijuSpider
            • scrapy crawl meijuSpider --nolog
          • 新建一个运行文件start

          • # 执行scrapy命令:开启爬虫
            scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
            # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
            scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
            # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
            

    Scrapy爬取美剧网

    • mymeiju.py
    import scrapy
    from ..items import MeijuItem
    
    
    class MymeijuSpider(scrapy.Spider):
        # 爬虫名:唯一
        name = 'mymeiju'
        # 允许的域名列表
        allowed_domains = ['meijutt.tv']
        # 开始的url列表:启动项目后会直接自动爬取的url列表
        start_urls = ['https://www.meijutt.tv/new100.html']
        # 解析数据方法:
        # 1.当start_urls中的网页请求完成后会自动调用当前的parse方法,并返回响应
        def parse(self, response, **kwargs):
            print('*' * 100)
            # print(response)
            # print(type(response))  # <class 'scrapy.http.response.html.HtmlResponse'>
            # print(response.text)  # 获取文本内容
            # print(response.body)  # 二进制内容
            # print(response.json())  # 解析json
            print('*' * 100)
            # 解析数据:xpath
            li_list = response.xpath('//ul[@class="top-list  fn-clear"]/li')
            for li in li_list:
                # 有3种方式获取内容
                # name = li.xpath('./h5/a/text()').get()
                # name = li.xpath('./h5/a/text()')[0].extract()
                # name = li.xpath('./h5/a/text()').extract_first()
                # name = li.xpath('./span/text()').getall()  # 获取所有匹配的内容,他是一个列表
                name = li.xpath('./h5/a/text()').get()  # 剧名
                state = li.xpath('./span[1]/font/text()').get()  # 状态:级数
                mjzm = li.xpath('./span[2]/em/text()').get()  # 字幕
                mjjq = li.xpath('./span[3]/text()').get()  # 分类
                mjtv = li.xpath('./span[4]/text()').get()  # 电视台
                mjtime = li.xpath('./div[last()]/font/text()').get()  # 更新时间
                if not mjtime:
                    mjtime = li.xpath('./div[last()]/text()').get()
                # print(name)
                # item:封装每个数据
                # item = MeijuItem()
                # item['name'] = name  # 不能用点语法
                item = MeijuItem(
                    name=name, state=state, mjzm=mjzm,
                    mjjq=mjjq, mjtv=mjtv, mjtime=mjtime
                )
                # 生成器,既是迭代器,又是可迭代对象
                yield item
                # 这里的item会传入到pipelines中,需要做两个事情
                # 1.需要在parse方法中yield item
                # 2.需要在settings中将ITEM_PIPELINES设置好
                # yield返回2种值
                # 1.返回item
                # 2.返回Request/FormRequest
    
    • items.py
    import scrapy
    
    
    # Item: 类似Django种的Model
    class MeijuItem(scrapy.Item):
        name = scrapy.Field()
        state = scrapy.Field()
        mjzm = scrapy.Field()
        mjjq = scrapy.Field()
        mjtv = scrapy.Field()
        mjtime = scrapy.Field()
    
    • pipelines.py
    from itemadapter import ItemAdapter
    
    
    # pipeline:专门用来存储数据
    class MeijuPipeline:
        # 开始爬虫:自动调用该函数一次
        def open_spider(self, spider):
            pass
            # 打开文件
            # self.fp = open('meiju.txt', 'a', encoding='utf-8')
            # print('开始爬取......')
    
        # 关闭爬虫:自动调用
        def close_spider(self, spider):
            pass
            # 关闭文件
            # self.fp.close()
            # print('爬虫结束!')
    
        # process_item:会被调用很多次(取决于yield item的次数)
        def process_item(self, item, spider):
            # print(spider.name)  # 爬虫名
            # print(f'item:{item}', type(item))
            # 写入文件
            # with open('meiju.txt', 'a', encoding='utf-8') as fp:
            #     fp.write(str(item) + '\n')
            # self.fp.write(str(item) + '\n')
            # print(f'{item["name"]}写入成功')
            return item
    
    • settings.py
    BOT_NAME = 'meiju'
    SPIDER_MODULES = ['meiju.spiders']
    NEWSPIDER_MODULE = 'meiju.spiders'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
       'meiju.pipelines.MeijuPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    # 执行scrapy命令:开启爬虫
    # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
    # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
    
    # 使用split
    # scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
    # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
    
    # 快速存储成指定格式的文件
    # 支持的文件格式('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
    # scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.json'.split())
    scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.csv'.split())
    

    Scrapy爬取当当网

    • dangdang_spider.py
    import scrapy
    from ..items import DangdangItem
    
    
    class DangdangSpiderSpider(scrapy.Spider):
        name = 'dangdang_spider'
        allowed_domains = ['dangdang.com']
        start_urls = ['http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html']
        def parse(self, response, **kwargs):
            li_list = response.xpath('//ul[@id="component_59"]/li')
            for li in li_list:
                book_name = li.xpath('./a/@title').get()
                book_price = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()').get()
                book_author = li.xpath('./p[@class="search_book_author"]/span[1]/a/text()').get()
                book_publishers = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').get()
                book_star = li.xpath('./p[@class="search_star_line"]/span/span/@style').get()[6:-1]
                book_comment = li.xpath('./p[4]/a/text()').get()
                book_picture = li.xpath('./a/img/@data-original')
                if book_picture:
                    book_picture = book_picture.get()
                else:
                    book_picture = li.xpath('./a/img/@src').get()
                print(book_picture)
                item = DangdangItem(
                    book_name=book_name,
                    book_price=book_price,
                    book_author=book_author,
                    book_publishers=book_publishers,
                    book_star=book_star,
                    book_comment=book_comment,
                    book_picture=book_picture
                )
                yield item
    
    • items.py
    import scrapy
    
    
    class DangdangItem(scrapy.Item):
        book_name = scrapy.Field()
        book_price = scrapy.Field()
        book_author = scrapy.Field()
        book_publishers = scrapy.Field()
        book_star = scrapy.Field()
        book_comment = scrapy.Field()
        book_picture = scrapy.Field()
    
    • pipelines.py
    import pymysql
    
    
    class DangdangPipeline:
        def open_spider(self, spider):
            print('开始爬取')
            self.db = pymysql.connect(
                host='localhost',
                port=3306,
                user='root',
                password='******',
                database='spider2003',
                charset='utf8'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('爬取结束')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            # item['name'].replace('"', "'")  # 将内容中的双引号替换成单引号,避免SQL拼接出错
            sql = 'insert into dangdang(book_name, book_price, book_author, book_publishers, book_star, book_comment, book_picture) values ("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (item['book_name'], item['book_price'], item['book_author'], item['book_publishers'], item['book_star'], item['book_comment'], item['book_picture'])
            try:
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'dangdang'
    SPIDER_MODULES = ['dangdang.spiders']
    NEWSPIDER_MODULE = 'dangdang.spiders'
    ROBOTSTXT_OBEY = False
    ITEM_PIPELINES = {
       'dangdang.pipelines.DangdangPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl dangdang_spider'.split())
    

    Scrapy进阶

    • Scrapy Shell

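      • Scrapy Shell 用于在命令行中交互式调试选择器,不必反复运行整个爬虫。下面是一个使用示意(目标URL仅为举例):

      • # 命令行启动:scrapy shell "https://www.meijutt.tv/new100.html"
        # 启动后会自动注入request、response等对象,可直接测试表达式
        response.status                             # 状态码
        response.xpath('//title/text()').get()      # xpath选择器
        response.css('title::text').get()           # css选择器
        view(response)                              # 在系统浏览器中打开该响应
        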
    • Selectors选择器

      • Scrapy Selectors 内置 XPath 和 CSS Selector 表达式机制
      • Selector四个基本的方法(xpath最常用)
        • xpath()
          • 传入xpath表达式,返回该表达式所对应的所有节点的selector list列表
        • extract()
          • 序列化该节点为Unicode字符串并返回list, extract_first()
        • css()
          • 传入CSS表达式,返回该表达式所对应的所有节点的selector list列表,语法同 BeautifulSoup4中soup.select()
        • re()
          • 根据传入的正则表达式对数据进行提取,返回Unicode字符串list列表
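      • 四个方法的简单对比示意(直接用Selector解析一段HTML字符串,不依赖具体网站):

      • from scrapy.selector import Selector
        
        html = '<ul><li><a href="/a">第1条</a></li><li><a href="/b">第2条</a></li></ul>'
        sel = Selector(text=html)
        print(sel.xpath('//li/a/text()').extract())        # ['第1条', '第2条']
        print(sel.xpath('//li/a/text()').extract_first())  # '第1条'
        print(sel.css('li a::attr(href)').extract())       # ['/a', '/b']
        print(sel.xpath('//li/a/text()').re(r'第(\d)条'))    # ['1', '2']
        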
    • Spider类

      • 概述
        • Spider类定义了如何爬取某个(或某些)网站。包括了爬取的动作(例如:是否跟进链接)以及如何从网页的内容中提取结构化数据(爬取item)。 换句话说,Spider就是你定义爬取的动作及分析某个网页(或者是有些网页)的地方
        • scrapy.Spider是最基本的类,所有编写的爬虫必须继承这个类
      • 主要用到的函数及调用顺序
        • __init__()
          • 初始化爬虫名字和start_urls列表
        • start_requests()
          • 调用make_requests_from_url():生成Requests对象交给Scrapy下载并返回response
        • parse(self, response)
          • 解析response,并返回Item或Requests(需指定回调函数)
          • Item传给Item pipline持久化,而Requests交由Scrapy下载,并由指定的回调函数处理(默认parse()),一直进行循环,直到处理完所有的数据为止
      • 主要属性和方法
        • name
          • 定义spider名字的字符串。唯一
        • allowed_domains
          • 包含了spider允许爬取的域名(domain)的列表,可选
        • start_urls
          • 初始URL元组/列表。当没有指定特定的URL时,spider将从该列表中开始进行爬取
        • start_requests(self)
          • 该方法必须返回一个可迭代对象(iterable)。该对象包含了spider用于爬取(默认实现是使用 start_urls 的url)的第一个Request
          • 当spider启动爬取并且未指定start_urls时,该方法被调用
        • parse(self, response)
          • 当请求url返回网页没有指定回调函数时,默认的Request对象回调函数。用来处理网页返回的response,以及生成Item或者Request对象
        • log(self, message[, level, component])
          • 使用 scrapy.log.msg() 方法记录日志信息
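      • 一个最小的Spider骨架示意(重写start_requests自行构造初始请求,域名example.com仅为占位):

      • import scrapy
        
        class MiniSpider(scrapy.Spider):
            name = 'mini'                      # 爬虫名,必须唯一
            allowed_domains = ['example.com']  # 允许爬取的域名
        
            def start_requests(self):
                # 不写start_urls,自己生成第一批Request
                for page in range(1, 3):
                    url = f'http://www.example.com/list?page={page}'
                    yield scrapy.Request(url, callback=self.parse)
        
            def parse(self, response, **kwargs):
                # 解析响应:可以yield item,也可以继续yield新的Request
                self.log(f'爬取完成: {response.url}')
                yield {'url': response.url, 'title': response.xpath('//title/text()').get()}
        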
    • CrawlSpider类

      • 概述
        • CrawlSpider是Spider的派生类
        • Spider类的设计原则是只爬取start_urls列表中的网页
        • CrawlSpider类定义了一些规则(rule)来提供跟进link的方便的机制,从爬取的网页中获取link并继续爬取的工作更适合
          • 自动翻页
      • LinkExtractors
        • 概述
          • 使用LinkExtractors 的目的: 提取链接
          • 每个LinkExtractor唯一的公共方法是 extract_links(),它接收一个 Response 对象,并返回 scrapy.link.Link 对象的列表
        • 主要参数
          • allow
            • 满足括号中“正则表达式”的值会被提取,如果为空,则全部匹配
          • deny
            • 与这个正则表达式(或正则表达式列表)匹配的URL一定不提取
          • allow_domains
            • 会被提取的链接的domains/域名
          • deny_domains
            • 一定不会被提取链接的domains
          • restrict_xpaths
            • 使用xpath表达式,和allow共同作用过滤链接/范围
      • rules
        • 概述
          • 在rules中包含一个或多个Rule对象,每个Rule对爬取网站的动作定义了特定操作
          • 如果多个Rule匹配了相同的链接,则根据规则在本集合中被定义的顺序,第一个会被使用
        • 主要参数
          • link_extractor
            • 是一个Link Extractor对象,用于定义需要提取的链接
          • callback
            • 从link_extractor中每获取到链接时,参数所指定的值作为回调函数,该回调函数接受一个response作为其第一个参数(尽量避免使用parse)
          • follow
            • 是一个布尔(boolean)值,指定了根据该规则从response提取的链接是否需要跟进
            • follow=True
              • 跟随:会自动匹配子网页中的其他符合规则的链接并爬取
          • process_links
            • 指定该spider中哪个的函数将会被调用,从link_extractor中获取到链接列表时将会调用该函数
            • 该方法主要用来过滤
          • process_request
            • 指定该spider中哪个的函数将会被调用, 该规则提取到每个request时都会调用该函数(用来过滤request)
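      • 一个最小的CrawlSpider骨架示意(站点与规则均为举例,演示allow、restrict_xpaths与follow的配合):

      • import scrapy
        from scrapy.spiders import CrawlSpider, Rule
        from scrapy.linkextractors import LinkExtractor
        
        class MiniCrawlSpider(CrawlSpider):
            name = 'mini_crawl'
            allowed_domains = ['example.com']
            start_urls = ['http://www.example.com/list/page/1/']
        
            rules = [
                # 只跟进分页区域中形如 /list/page/数字/ 的链接,实现自动翻页
                Rule(
                    LinkExtractor(allow=(r'/list/page/\d+/',), restrict_xpaths=('//div[@class="pagination"]',)),
                    callback='parse_item',  # 回调函数尽量不要叫parse
                    follow=True
                )
            ]
        
            def parse_item(self, response, **kwargs):
                yield {'url': response.url}
        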
    • Robots协议

      • 概述
        • Robots协议(也称为爬虫协议、机器人协议等)的全称是“网络爬虫排除标准”(Robots Exclusion Protocol),网站通过Robots协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取
        • robots.txt文件是一个文本文件。当一个搜索蜘蛛访问一个站点时,它会首先检查该站点根目录下是否存在robots.txt,如果存在,搜索机器人就会按照该文件中的内容来确定访问的范围;如果该文件不存在,所有的搜索蜘蛛将能够访问网站上所有没有被口令保护的页面
      • 使用
        • 不遵守robots协议:将 ROBOTSTXT_OBEY = True 改为 False
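        • 也可以用标准库urllib.robotparser读取robots.txt,判断某个URL是否允许抓取(示意):

        • from urllib import robotparser
          
          rp = robotparser.RobotFileParser()
          rp.set_url('https://www.baidu.com/robots.txt')
          rp.read()
          # can_fetch(User-Agent, url):返回该UA是否被允许抓取这个url
          print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))
          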
    • 深度爬取

      • 爬取到链接,进入链接继续爬取,爬取到链接,再次进入链接爬取......

      • yield scrapy.Request(
                        url=href,  # url链接
                        callback=self.parse_detail,  # 回调函数:请求成功后的响应
                        meta={'name': name}  # 传入到parse_detail中的数据
                    )
        
        • scrapy.Request异步爬取
      • name = response.meta['name']

        • 取出小说名
        • 逐级传递
      • yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)

        • 将数据传入管道
    • 循环遍历实现翻页

      • # 爬取下一页
        if self.page <= 100:
            print(f'---开始爬取{self.page}页---')
            self.page = self.page + 1
            url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
            yield scrapy.Request(url, callback=self.parse)
        

    Scrapy爬取笔趣阁

    • biquege_spider.py
    import requests
    import scrapy
    from ..items import BiqugeItem
    
    
    class BiqugeSpiderSpider(scrapy.Spider):
        name = 'biquge_spider'
        allowed_domains = ['biquge5200.cc']
        start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']
        # 爬取笔趣阁的首页
        def parse(self, response, **kwargs):
            # 解析数据
            li_list = response.xpath('//div[@class="l"]/ul/li')
            for li in li_list:
                name = li.xpath('./span[@class="s2"]/a/text()').get()  # 小说名
                href = li.xpath('./span[@class="s2"]/a/@href').get()  # 小说链接
                # requests:同步
                # print(len(requests.get(href).text))
                # print('-' * 100)
                # 异步:scrapy.Request
                # 请求小说详情页
                yield scrapy.Request(
                    url=href,  # url链接
                    callback=self.parse_detail,  # 回调函数:请求成功后的响应
                    meta={'name': name}  # 传入到parse_detail中的数据
                )
        # 详情页
        def parse_detail(self, response):
            # 取出小说名
            name = response.meta['name']
            # 解析数据
            dd_list = response.xpath('//div[@id="list"]/dl/dd')
            for dd in dd_list:
                zj_name = dd.xpath('./a/text()').get()  # 章节名称
                zj_href = dd.xpath('./a/@href').get()  # 章节内容链接
                # 请求每个章节的小说内容
                yield scrapy.Request(
                    url=zj_href,
                    callback=self.parse_content,
                    meta={'name': name, 'zj_name': zj_name}
                )
        # 小说内容页
        def parse_content(self, response):
            # 取出小说名及章节名
            name = response.meta['name']
            zj_name = response.meta['zj_name']
            # 解析数据
            p_list = response.xpath('//*[@id="content"]/p/text()').getall()
            zj_content = '\n'.join(p_list)
            # item
            # 将数据传入管道
            yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)
    
    • items.py
    import scrapy
    
    
    class BiqugeItem(scrapy.Item):
        name = scrapy.Field()
        zj_name = scrapy.Field()
        zj_content = scrapy.Field()
    
    • pipelines.py
    import os
    from itemadapter import ItemAdapter
    
    
    class BiqugePipeline:
        # def __init__(self):
        #     self.path = r'C:\Users\86188\Desktop\Spider\Day05\scrapy_project\biquge\books'
        def process_item(self, item, spider):
            # 小说对应的目录不存在则先创建,再把章节内容追加写入
            if not os.path.isdir("books/%s" % item['name']):
                os.makedirs("books/%s" % item['name'])
            with open('books/%s/%s.txt' % (item["name"], item["zj_name"]), 'a', encoding='utf-8') as fp:
                fp.write(item["zj_content"])
                fp.flush()
            print(f'item:{item["name"]}-{item["zj_name"]}')
            return item
    
    • settings.py
    BOT_NAME = 'biquge'
    SPIDER_MODULES = ['biquge.spiders']
    NEWSPIDER_MODULE = 'biquge.spiders'
    USER_AGENT = 'biquge (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'biquge.pipelines.BiqugePipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl biquge_spider'.split())
    

    Scrapy爬取京东

    • jd_spider.py
    import scrapy
    from selenium import webdriver
    from ..items import JdItem
    
    
    '''
    常见的反爬虫策略之一。
    这个参数的值,表明你是从哪个网页跳转过来的。
    比如说我请求获得淘宝评论的时候,他的referer是商品详情页面,表明我从这件商品详情页请求的相关评论,没有referer就不会给你这个评论
    from fake_useragent import UserAgent
    #伪装成浏览器
    ua = UserAgent()
    headers = {'User-Agent':ua.random} #一般网站伪装成这样也就够了,但是如果想爬图片,图片反盗链的话。如下
    #其实很好理解,就是告诉你要下载的那个图片页面,我是从主页面来的,现在把数据给我。
    headers = {'User-Agent':ua.random,'Referer':'这里放入图片的主页面'}
    #然后在后续requests中传入header即可
    '''
    class JdSpiderSpider(scrapy.Spider):
        name = 'jd_spider'
        allowed_domains = ['jd.com']
        start_urls = [
            # 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0',
            # 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=4&s=79&scrolling=y&log_id=1600660067305.2410&tpl=3_M&isList=1&show_items=',
            'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=6&s=131&scrolling=y&log_id=1600661434422.8716&tpl=3_M&isList=1&show_items='
        ]
        page1 = 1
        # page2 = 2
        s1 = 1
        # s2 = 27
        def parse(self, response, **kwargs):
            # driver = webdriver.Chrome()
            # driver.execute_script('window.scrollBy(0,10000)')
            li_list = response.xpath('//li[@class="gl-item"]')
            print(len(li_list))
            for li in li_list:
                shoes_name = li.xpath('./div/div[@class="p-img"]/a/@title').get()
                shoes_price = li.xpath('./div/div[@class="p-price"]/strong/i/text()').get()
                shoes_picture = li.xpath('./div/div[@class="p-img"]/a/img/@data-lazy-img').get()
                print(shoes_name, shoes_price, shoes_picture)
                yield JdItem(shoes_name=shoes_name, shoes_price=shoes_price, shoes_picture=shoes_picture)
            # driver.close()
            # if self.page1 <= 10:
            # # if self.page2 <= 200:
            #     print(f'---开始爬取{self.page1}页---')
            #     # print(f'---开始爬取{self.page2}页---')
            #     self.page1 = self.page1 + 2
            #     self.s1 = self.s1 + 52
            #     # self.page2 = self.page2 + 2
            #     # self.s2 = self.s2 + 52
            #     url = f'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page={self.page1}&s={self.s1}&click=0'
            #     # url =  f'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page={self.page2}&s={self.s2}&scrolling=y&log_id=1600431181482.2679&tpl=3_M&isList=1&show_items='
            #
            #
            #     yield scrapy.Request(url, callback=self.parse)
    
    • items.py
    import scrapy
    
    
    class JdItem(scrapy.Item):
        shoes_name = scrapy.Field()
        shoes_price = scrapy.Field()
        shoes_picture = scrapy.Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class JdPipeline:
        def open_spider(self, spider):
            print('连接数据库')
            self.db = pymysql.connect(
                user='root', password='******',database='spider2003'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('关闭连接')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            sql = 'insert into jd (shoes_name, shoes_price, shoes_picture) values ("%s", "%s", "%s")' % (item['shoes_name'], item['shoes_price'], item['shoes_picture'])
            try:
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'jd'
    SPIDER_MODULES = ['jd.spiders']
    NEWSPIDER_MODULE = 'jd.spiders'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'referer': 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=3&s=53&click=0'
    }
    ITEM_PIPELINES = {
       'jd.pipelines.JdPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    # scrapy.cmdline.execute('scrapy crawl jd_spider --nolog'.split())
    scrapy.cmdline.execute('scrapy crawl jd_spider'.split())
    

    Scrapy爬取糗事百科

    • qsbk_spider.py
    import scrapy
    from scrapy.spiders import CrawlSpider, Rule
    from scrapy.linkextractors import LinkExtractor
    from ..items import QiushibaikeItem
    # 导入日志模块
    import logging
    # 配置日志输出格式
    LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # 设置输出格式
    DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # 设置时间格式
    logging.basicConfig(filename='qsbk.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
    
    
    class QsbkSpiderSpider(CrawlSpider):
    # class QsbkSpiderSpider(scrapy.Spider):
        name = 'qsbk_spider'
        allowed_domains = ['qiushibaike.com']
        start_urls = ['https://www.qiushibaike.com/text/page/1/']
        rules = [
            Rule(
                LinkExtractor(
                    allow=(r'/text/page/\d+/',),
                    restrict_xpaths=('//ul[@class="pagination"]',)
                ),
                callback="parse_item",
                follow=True
            )
        ]
        def parse_item(self, response, **kwargs):
            div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
            for div in div_list:
                author = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()').get()
                content = div.xpath('./a[@class="contentHerf"]/div/span/text()').getall()  # 有br换行时,要用getall,但是要处理结果
                logging.info(f'download:{author}')
                yield QiushibaikeItem(author=author, content=content)
    
    • items.py
    import scrapy
    
    
    class QiushibaikeItem(scrapy.Item):
        author = scrapy.Field()
        content = scrapy.Field()
    
    • pipelines.py
    import os
    import random
    from itemadapter import ItemAdapter
    
    
    class QiushibaikePipeline:
        def process_item(self, item, spider):
            with open('cross_talk/%s-%f.txt' % (item['author'].replace('\n', ''), random.random()), 'w', encoding='utf-8') as fp:
                fp.write((''.join(item['content'])).replace('\n', ''))
                fp.flush()
            return item
    
    • settings.py
    BOT_NAME = 'qiushibaike'
    SPIDER_MODULES = ['qiushibaike.spiders']
    NEWSPIDER_MODULE = 'qiushibaike.spiders'
    USER_AGENT = 'qiushibaike (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'qiushibaike.pipelines.QiushibaikePipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    # scrapy.cmdline.execute('scrapy crawl qsbk_spider --nolog'.split())
    scrapy.cmdline.execute('scrapy crawl qsbk_spider'.split())
    

    Scrapy爬取新浪新闻

    • news_spider.py
    import scrapy
    from ..items import SinaNewsItem
    
    
    class NewsSpiderSpider(scrapy.Spider):
        name = 'news_spider'
        allowed_domains = ['sina.com.cn']
        start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
        # 自定义类属性
        page = 1
        def parse(self, response, **kwargs):
            li_list = response.xpath('//ul[@class="list_009"]/li')
            for li in li_list:
                news = li.xpath('./a/text()').get()
                news_time = li.xpath('./span/text()').get()
                news_link = li.xpath('./a/@href').get()
    
                item = SinaNewsItem(
                    news=news,
                    news_time=news_time,
                    news_link=news_link,
                )
                yield item
            # 爬取下一页
            if self.page <= 100:
                print(f'---开始爬取{self.page}页---')
                self.page = self.page + 1
                url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
                yield scrapy.Request(url, callback=self.parse)
    
    • items.py
    import scrapy
    
    
    class SinaNewsItem(scrapy.Item):
        news = scrapy.Field()
        news_time = scrapy.Field()
        news_link = scrapy.Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class SinaNewsPipeline:
        def open_spider(self, spider):
            print('开始爬取')
            self.db = pymysql.connect(
                host='localhost',
                port=3306,
                user='root',
                password='******',
                database='spider2003',
                charset='utf8'
            )
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('爬取结束')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            news = item['news']
            news_time = item['news_time']
            news_link = item['news_link']
            try:
                sql = 'insert into sina_news(news, news_time, news_link) values ("%s", "%s", "%s")' % (news, news_time, news_link)
                self.cur.execute(sql)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            return item
    
    • settings.py
    BOT_NAME = 'sina_news'
    SPIDER_MODULES = ['sina_news.spiders']
    NEWSPIDER_MODULE = 'sina_news.spiders'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'sina_news.pipelines.SinaNewsPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl news_spider'.split())
    

    Scrapy高级

    • 日志logging

      • Scrapy提供的log功能

        • 可以修改配置文件settings.py,任意位置添加下面两行,效果会清爽很多

          • LOG_ENABLED = True  # 开启
            LOG_FILE = "mySpider.log" #日志文件名
            LOG_LEVEL = "INFO" #日志级别
            
        • Log levels

          • Scrapy提供5层logging级别
            • CRITICAL - 严重错误(critical)
            • ERROR - 一般错误(regular errors)
            • WARNING - 警告信息(warning messages)
            • INFO - 一般信息(informational messages)
            • DEBUG - 调试信息(debugging messages)
        • logging设置

          • 通过在setting.py中进行以下设置可以被用来配置logging
            • LOG_ENABLED
              • 默认: True,启用logging
            • LOG_ENCODING
              • 默认: 'utf-8',logging使用的编码
            • LOG_FILE
              • 默认: None,在当前目录里创建logging输出文件的文件名
            • LOG_LEVEL
              • 默认: 'DEBUG',log的最低级别
          • scrapy自带的scrapy.log模块已被弃用,建议直接使用Python自带的logging模块
      • 使用python自带日志模块

        • import logging
          LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # 设置输出格式
          DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # 设置时间格式
          logging.basicConfig(filename='sina.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
          logging.warning('错误')
          
    • settings配置

      • 概述

        • Scrapy设置(settings)提供了定制Scrapy组件的方法。可以控制包括核心(core),插件(extension),pipeline及spider组件
      • 设置

        • BOT_NAME

          • 默认: 'scrapybot'
          • Scrapy项目实现的bot的名字(也为项目名称)。 这将用来构造默认 User-Agent,同时也用来log
          • 当您使用startproject命令创建项目时其也被自动赋值
        • CONCURRENT_ITEMS

          • 默认: 100
          • Item Processor(即 Item Pipeline)同时并行处理item的最大数量
        • CONCURRENT_REQUESTS

          • 默认: 16
          • Scrapy downloader 并发请求(concurrent requests)的最大值
        • CONCURRENT_REQUESTS_PER_DOMAIN

          • 默认: 8
          • 对单个网站进行并发请求的最大值
        • CONCURRENT_REQUESTS_PER_IP

          • 默认: 0
          • 对单个IP进行并发请求的最大值
          • 如果非0,则忽略CONCURRENT_REQUESTS_PER_DOMAIN设定,使用该设定
        • DEFAULT_REQUEST_HEADERS

          • DEFAULT_REQUEST_HEADERS = {
              'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Accept-Language': 'en',
            }
            
          • Scrapy HTTP Request使用的默认header,由DefaultHeadersMiddleware产生

        • DEPTH_LIMIT

          • 默认: 0
          • 爬取网站最大允许的深度(depth)值。如果为0,则没有限制
        • DOWNLOADER

          • 默认: 'scrapy.core.downloader.Downloader'
          • 用于crawl的downloader
        • DOWNLOADER_MIDDLEWARES

          • 默认:{}
          • 保存项目中启用的下载中间件及其顺序的字典
        • DOWNLOADER_MIDDLEWARES_BASE

          • 默认{...}
          • 包含Scrapy默认启用的下载中间件的字典。 永远不要在项目中修改该设定
        • DOWNLOAD_DELAY

          • 默认: 0
          • 下载器在下载同一个网站下一个页面前需要等待的时间。该选项可以用来限制爬取速度,减轻服务器压力。同时也支持小数
        • DOWNLOAD_TIMEOUT

          • 默认: 180
          • 下载器超时时间(单位: 秒)
        • ITEM_PIPELINES

          • 默认: {}
          • 保存项目中启用的pipeline及其顺序的字典。该字典默认为空,值(value)任意。 不过值(value)习惯设定在0-1000范围内,越小,优先级越高
        • ITEM_PIPELINES_BASE

          • 默认: {}
          • 保存项目中默认启用的pipeline的字典。 永远不要在项目中修改该设定,而是修改ITEM_PIPELINES
        • LOG_ENABLED

          • 默认:True
          • 是否启用logging
        • LOG_ENCODING

          • 默认: 'utf-8'
          • logging使用的编码
        • LOG_FILE

          • 默认: None
          • logging输出的文件名。如果为None,则使用标准错误输出(standard error)
        • LOG_LEVEL

          • 默认: 'DEBUG'
          • 可选的级别有: CRITICAL、 ERROR、WARNING、INFO、DEBUG
        • LOG_STDOUT

          • 默认: False
          • 如果为 True ,进程所有的标准输出(及错误)将会被重定向到log中。例如, 执行 print 'hello' ,其将会在Scrapy log中显示
        • REDIRECT_MAX_TIMES

          • 默认: 20
          • 定义request允许重定向的最大次数。超过该限制后该request直接返回获取到的结果。 对某些任务我们使用Firefox默认值
        • ROBOTSTXT_OBEY

          • 默认:True
          • 如果启用,Scrapy将会遵守 robots.txt策略
        • SCHEDULER

          • 默认: 'scrapy.core.scheduler.Scheduler'
          • 用于爬取的调度器
        • SPIDER_MIDDLEWARES_BASE

          • 默认:{...}
          • 保存项目中默认启用的spider中间件的字典。 永远不要在项目中修改该设定,而是修改SPIDER_MIDDLEWARES
        • SPIDER_MODULES

          • 默认: []
          • Scrapy搜索spider的模块列表
        • URLLENGTH_LIMIT

          • 默认: 2083
          • 爬取URL的最大长度
        • USER_AGENT

          • 默认: "Scrapy/VERSION (+http://scrapy.org)"
          • 爬取的默认User-Agent,除非被覆盖
        • REACTOR_THREADPOOL_MAXSIZE

          • 线程池数量,默认10条
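      • 常用设置组合示意(写在settings.py中,项目名myspider与具体取值仅为举例,按需调整):

      • BOT_NAME = 'myspider'
        ROBOTSTXT_OBEY = False              # 是否遵守robots协议
        CONCURRENT_REQUESTS = 16            # 下载器最大并发请求数
        CONCURRENT_REQUESTS_PER_DOMAIN = 8  # 单个网站的最大并发请求数
        DOWNLOAD_DELAY = 1                  # 同一网站两次下载之间等待的秒数
        DOWNLOAD_TIMEOUT = 180              # 下载超时时间(秒)
        DEPTH_LIMIT = 3                     # 最大爬取深度,0表示不限制
        LOG_LEVEL = 'INFO'                  # 日志级别
        LOG_FILE = 'myspider.log'           # 日志文件
        DEFAULT_REQUEST_HEADERS = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
        }
        ITEM_PIPELINES = {
            'myspider.pipelines.MyspiderPipeline': 300,
        }
        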
    • 自定义中间件

      • 中间件种类

        • process_request(self, request, spider)

          • 当每个request通过下载中间件时,该方法被调用
        • process_response(self, request, response, spider)

          • 当下载器完成http请求,传递响应给引擎的时候调用(本节末尾附一个process_response的示意)
        • 自定义

          • 创建中间件类

          • # 随机的User-Agent
            class RandomUserAgent(object):
                def process_request(self, request, spider):
                    useragent = random.choice(USER_AGENTS)
                    request.headers.setdefault("User-Agent", useragent)
            
          • # 随机代理IP
            class RandomProxy(object):
                def process_request(self, request, spider):
                    proxy = random.choice(PROXIES)
                    request.meta['proxy'] = "http://" + proxy['ip_port']
            
          • 配置中间件

            • 最后设置setting.py里的DOWNLOADER_MIDDLEWARES,添加自己编写的下载中间件类

            • DOWNLOADER_MIDDLEWARES = {
                 'baidu.middlewares.BaiduDownloaderMiddleware': 543,
                 # 配置中间件
                 'baidu.middlewares.UADownloaderMiddleware': 300,
                 'baidu.middlewares.ProxyDownloaderMiddleware': 200,
              }
              
              USER_AGENTS = [
                  "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                  "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                  "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                  "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                  "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                  "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
              ]
              
              PROXIES = [
                  {'ip_port': '58.218.200.214:8730'},
                  {'ip_port': '58.218.200.247:2359'},
                  {'ip_port': '58.218.200.248:8503'},
                  {'ip_port': '58.218.200.229:4612'},
                  {'ip_port': '58.218.200.214:5570'},
                  {'ip_port': '58.218.200.214:8801'},
              ]
              
    • POST请求

      • If the first request is a POST

        • Comment out the start_urls attribute and override the start_requests method (a complete runnable sketch is given at the end of this POST section)

        • def start_requests(self):
              yield scrapy.FormRequest(
                      url='http://fanyi.baidu.com/sug',
                      formdata={'kw': 'wolf'},
                      callback=self.parse_item
                  )
          
      • If the POST request is not the first request

        • Simply yield a scrapy.FormRequest from the parse callback handling the previous response (the Scrapy counterpart of requests.post(url, data=data, headers=headers))

        • yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_item)
          
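      • Putting the POST-first case together: a minimal runnable sketch (the spider name and the JSON handling in parse_item are illustrative assumptions)

        • import scrapy
          
          
          class FanyiDemoSpider(scrapy.Spider):
              name = 'fanyi_demo'
          
              def start_requests(self):
                  # the very first request is a POST, so start_urls is omitted
                  yield scrapy.FormRequest(
                      url='http://fanyi.baidu.com/sug',
                      formdata={'kw': 'wolf'},
                      callback=self.parse_item
                  )
          
              def parse_item(self, response):
                  # the sug endpoint returns JSON (requires Scrapy >= 2.1 for response.json())
                  print(response.json())
          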

    爬取新片场(综合)

    • xpc_spider.py
    import scrapy
    from ..items import *
    
    
    class XpcSpiderSpider(scrapy.Spider):
        name = 'xpc_spider'
        allowed_domains = ['xinpianchang.com']
        start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
        def parse(self, response, **kwargs):
            # 解析数据
            # 视频列表数据
            li_list = response.xpath('//ul[@class="video-list"][1]/li')
            for li in li_list:
                # 作品id
                pid = li.xpath('./@data-articleid').get()
                # 作品标题
                title = li.xpath('./div/div[1]/a/p/text()').get()
                # 缩略图
                thumbnail = li.xpath('./a/img/@_src').get()
                category_list = li.xpath('.//div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').getall()
                # 分类
                category = '|'.join(category_list)
                category = category.replace(' ', '').replace('\n', '').replace('\t', '')
                # 发布时间
                created_at = li.xpath('.//p[@class="fs_12"]/text()').get()
                # item
                item = PostsItem()
                item['pid'] = pid
                item['title'] = title
                item['thumbnail'] = thumbnail
                item['category'] = category
                item['created_at'] = created_at
                # 进入详情页
                post_url = f'https://www.xinpianchang.com/a{pid}?from=ArticleList'
                request = scrapy.Request(url=post_url, callback=self.post_detail)
                request.meta['post_item'] = item
                yield request
        # 作品详情页
        def post_detail(self, response):
            post_item = response.meta.get('post_item')
            pid = post_item['pid']
            # 解析数据
            # 作品描述
            description_list = response.xpath('//p[@class="desc line-hide fs_14 c_b_3 fw_300 line-hide-3"]/text()').getall()
            description = ''.join(description_list)
            description = description.replace(' ', '').replace('\n', '').replace('\t', '')
            post_item['description'] = description
            # 播放次数
            play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
            post_item['play_counts'] = play_counts
            # 点赞次数
            like_counts = response.xpath('//span[@class="v-center like-counts fs_12 c_w_f fw_300"]/@data-counts').get()
            post_item['like_counts'] = like_counts
            # 视频数据
            # video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/ryM1l4365Wzwod2V?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
            vid = response.xpath('//a[@class="collection-star hollow-star"]/@data-vid').get()
            video_url = f'https://mod-api.xinpianchang.com/mod/api/v2/media/{vid}?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
            # 请求视频数据
            request = scrapy.Request(url=video_url, callback=self.video_detail)
            request.meta['post_item'] = post_item
            yield request
            # 创作者数据
            li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul/li')
            for li in li_list:
                # 创作者id
                cid = li.xpath('./a/@data-userid').get()
                # item
                composer_item = ComposersItem()
                composer_item['cid'] = cid
                # 创作者url
                composer_url = li.xpath('./a/@href').get()
                composer_url = 'https://www.xinpianchang.com/' + composer_url
                # 访问创作者详情页
                request2 = scrapy.Request(url=composer_url, callback=self.composer_detail)
                request2.meta['composer_item'] = composer_item
                yield request2
                # 版权/角色数据
                cr_item = CopyrightsItem()
                cr_item['pcid'] = f'{pid}_{cid}'
                cr_item['pid'] = pid
                cr_item['cid'] = cid
                cr_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
                yield cr_item
            # 评论数据
            comment_url = f'https://app.xinpianchang.com/comments?resource_id={pid}&type=article&page=1&per_page=24'
            yield scrapy.Request(
                url=comment_url,
                callback=self.comment_detail
            )
        # 视频数据
        def video_detail(self, response):
            post_item = response.meta.get('post_item')
            # 解析数据
            content = response.json()
            # 视频预览图
            preview = content['data']['cover']
            # 视频链接
            video = content['data']['resource']['progressive'][0]['url']
            # 视频格式
            video_format = content['data']['resource']['progressive'][0]['mime']
            # 视频时长
            duration = content['data']['duration']
            # item
            post_item['preview'] = preview
            post_item['video'] = video
            post_item['video_format'] = video_format
            post_item['duration'] = duration
            # print(post_item)
            yield post_item
        # 创作者详情页
        def composer_detail(self, response):
            composer_item = response.meta.get('composer_item')
            # banner图
            banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
            banner = banner[banner.find('(')+1: -1]
            # 用户头像
            avatar = response.xpath('//div[@class="banner-wrap"]/div/span/img/@src').get()
            # 是否加V
            verified = response.xpath('//div[@class="banner-wrap"]/div/span/span[contains(@class, "author-v")]').get()
            verified = 'yes' if verified else 'no'
            # 名字
            name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
            # 自我介绍
            intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
            # 被点赞次数
            like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
            like_counts = like_counts.replace(',', '')
            # 被关注数量
            fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
            fans_counts = fans_counts.replace(',', '')
            # 关注数量
            follow_counts = response.xpath('//span[@class="follow-wrap"]/span[@class="fw_600 v-center"]/text()').get()
            follow_counts = follow_counts.replace(',', '')
            # 所在位置
            location = response.xpath('//span[@class="icon-location v-center"]/following-sibling::*/text()').get()
            location = location if location else ''
            # 职业
            career = response.xpath('//span[@class="icon-career v-center"]/following-sibling::*/text()').get()
            career = career if career else ''
            # item
            composer_item['banner'] = banner
            composer_item['avatar'] = avatar
            composer_item['verified'] = verified
            composer_item['name'] = name
            composer_item['intro'] = intro
            composer_item['like_counts'] = like_counts
            composer_item['fans_counts'] = fans_counts
            composer_item['follow_counts'] = follow_counts
            composer_item['location'] = location
            composer_item['career'] = career
            yield composer_item
        # 评论数据
        def comment_detail(self, response):
            content = response.json()
            comment_list = content['data']['list']
            for comment in comment_list:
                # 评论其他评论的数量
                reply = comment.get('referer')
                if reply:
                    reply = reply.get('id')
                else:
                    reply = 0
                item = CommentsItem(
                    commentid=comment['id'],
                    pid=comment['resource_id'],
                    cid=comment['userid'],
                    avatar=comment['userInfo']['avatar'],
                    uname=comment['userInfo']['username'],
                    created_at=comment['addtime'],
                    content=comment['content'],
                    like_counts=comment['count_approve'],
                    reply=reply
                )
                yield item
    
    • items.py
    from scrapy import Item, Field
    
    
    # 作品
    class PostsItem(Item):
        table_name = 'posts'  # 表名
        pid = Field()
        title = Field()
        thumbnail = Field()
        preview = Field()
        video = Field()
        video_format = Field()
        category = Field()
        duration = Field()
        created_at = Field()
        description = Field()
        play_counts = Field()
        like_counts = Field()
    class ComposersItem(Item):
        table_name = 'composers'  # 表名
        cid = Field()
        banner = Field()
        avatar = Field()
        verified = Field()
        name = Field()
        intro = Field()
        like_counts = Field()
        fans_counts = Field()
    
        follow_counts = Field()
        location = Field()
        career = Field()
    class CommentsItem(Item):
        table_name = 'comments'  # 表名
        commentid = Field()
        pid = Field()
        cid = Field()
        avatar = Field()
        uname = Field()
        created_at = Field()
        content = Field()
        like_counts = Field()
        reply = Field()
    # 版权:作者在作品中的角色
    class CopyrightsItem(Item):
        table_name = 'copyrights'  # 表名
        pcid = Field()
        pid = Field()
        cid = Field()
        roles = Field()
    
    • pipelines.py
    import pymysql
    from itemadapter import ItemAdapter
    
    
    class XpcPipeline:
        def open_spider(self, spider):
            print('---开始存入MySQL---')
            self.db = pymysql.connect(user='root', password='nzw19940611', database='xpc_2020')
            self.cur = self.db.cursor()
        def close_spider(self, spider):
            print('---存入MySQL结束---')
            self.cur.close()
            self.db.close()
        def process_item(self, item, spider):
            # 表名
            table_name = item.table_name
            keys = list(item.keys())
            values = list(item.values())
            # 所有字段组成的字符串
            key_str = ','.join(["`%s`" % key for key in keys])
            # 所有的值组成的字符串
            # value_str = ','.join(['"%s"' % value for value in values])
            value_str = ','.join(["%s"] * len(values))
            # 如果key冲突,则用新数据更新旧数据
            update_str = ','.join(["`{}`=%s".format(key) for key in keys])
            # sql
            sql = 'insert into `{}` ({}) values ({}) on duplicate key update {}'.format(
                table_name,
                key_str,
                value_str,
                update_str
            )
            # 执行sql
            self.cur.execute(sql, values*2)
            self.db.commit()
            print(f'---插入成功:{table_name}---')
            return item
    
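    • A short note on the upsert built in process_item (the two-field item below is a hypothetical example)
    # For an item with keys ['pid', 'title'] and table_name 'posts', the generated sql is:
    #   insert into `posts` (`pid`,`title`) values (%s,%s)
    #   on duplicate key update `pid`=%s,`title`=%s
    # execute(sql, values * 2) passes the value list twice: the first copy fills the
    # INSERT placeholders and the second copy fills the UPDATE placeholders.
    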
    • settings.py
    BOT_NAME = 'xpc'
    SPIDER_MODULES = ['xpc.spiders']
    NEWSPIDER_MODULE = 'xpc.spiders'
    USER_AGENT = 'xpc (+http://www.yourdomain.com)'
    ROBOTSTXT_OBEY = False
    DOWNLOAD_DELAY = 3
    ITEM_PIPELINES = {
       'xpc.pipelines.XpcPipeline': 300,
    }
    
    • start.py
    import scrapy.cmdline
    
    
    scrapy.cmdline.execute('scrapy crawl xpc_spider --nolog'.split())
    # scrapy.cmdline.execute('scrapy crawl xpc_spider'.split())
    