
    Python web scraping basics: study notes

    Mirror sources (to speed up module downloads)

    Commonly used mirrors in China:

    Aliyun http://mirrors.aliyun.com/pypi/simple/

    Douban http://pypi.douban.com/simple/

    Tsinghua University https://pypi.tuna.tsinghua.edu.cn/simple/

    University of Science and Technology of China (USTC) https://pypi.mirrors.ustc.edu.cn/simple/ (also reachable over plain http)

    pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
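
    If you don't want to pass -i every time, newer versions of pip (10+) can also persist a mirror as the default index, for example:

    pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/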

    1. Scraper basics (Baidu Tieba example)

    Basic crawling operations.
    A small Baidu Tieba example (using urllib's Request).
    
    # Open a URL and build a request
    from urllib.request import urlopen, Request
    # Encode a dict into URL query parameters
    from urllib.parse import urlencode
    # Random User-Agent (helps avoid getting the IP blocked)
    from fake_useragent import UserAgent
    
    
    # Fetch the HTML for a given URL
    def get_html(url):
        headers = {
            'User-Agent': UserAgent().chrome
        }
        request = Request(url, headers=headers)
        response = urlopen(request)
        # print(response.read().decode())
        return response.read()


    # Save the HTML bytes to a local file
    def save_html(filename, html_bytes):
        with open(filename, 'wb') as f:
            print('Saving ' + filename)
            f.write(html_bytes)
    
    
    def main():
        context = input('Enter the keyword to download: ')
        num = input('Enter the number of pages to download: ')
        base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'
        for pn in range(int(num)):
            args = {
                'pn': pn * 50,
                'kw': context
            }
            args = urlencode(args)
            # print(args)
            # print(base_url.format(args))
            filename = 'page_' + str(pn + 1) + '.html'
            print('Starting download of ' + filename)
            html_bytes = get_html(base_url.format(args))
            save_html(filename, html_bytes)
            print(filename + ' downloaded')
    
    
    if __name__ == '__main__':
        main()
    
    

    2. Making POST requests

    Send a POST request to a site, passing the form parameters.
    
    from urllib.request import urlopen, Request
    from urllib.parse import urlencode
    from fake_useragent import UserAgent
    
    url = 'http://www.zengqiang.club/admin/login'
    
    form_data = {
        'username': '曾强',
        'password': 'ZQZ981004'
    }
    # print(urlencode(form_data))
    headers = {
        'User-Agent': UserAgent().random
    }
    # print(headers)
    
    f_data = urlencode(form_data)
    request = Request(url, data=f_data.encode(), headers=headers)
    response = urlopen(request)
    print(response.read().decode())
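
    Passing data= to Request is what makes urllib send a POST rather than a GET. A quick way to see exactly what gets sent is to post a similar form to httpbin.org, which simply echoes the request back (a sketch for inspection, not part of the site login above):

    from urllib.request import urlopen, Request
    from urllib.parse import urlencode

    echo_request = Request('http://httpbin.org/post',
                           data=urlencode({'username': 'test', 'password': 'test'}).encode(),
                           headers={'User-Agent': 'Mozilla/5.0'})
    # httpbin echoes back the form fields, headers and origin IP as JSON
    print(urlopen(echo_request).read().decode())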
    
    

    3. Capturing AJAX requests (scraping the Douban movie chart)

    For AJAX data, look under the Network tab to find the real URL,
    work out what its parameters mean and how they change,
    then request that URL in a loop to collect the data.
    
    from urllib.request import Request, urlopen
    from fake_useragent import UserAgent
    
    base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start={}&limit=20'
    
    i = 0
    while True:
        headers = {
            'User-Agent': UserAgent().random
        }
        # fill the {} placeholder in base_url to build the full URL
        # (step by 20 to match limit=20; stepping by 50 would skip records)
        url = base_url.format(i * 20)
        request = Request(url, headers=headers)
        response = urlopen(request)
        info = response.read().decode()
        if len(info) < 10:
            break
        print(info)
        i += 1
    
    

    4. Using a proxy

    Use a proxy so that frequent requests don't get your own IP banned;
    the traffic goes out through another IP instead.
    
    from urllib.request import Request, build_opener
    from urllib.request import ProxyHandler
    from fake_useragent import UserAgent
    
    url = 'http://httpbin.org/get'
    
    headers = {
        'User-Agent': UserAgent().chrome
    }
    
    request = Request(url, headers=headers)
    
    # Two options (1: a paid proxy with authentication, 2: a free one found online)
    # handler = ProxyHandler({'http': 'username:password@ip:port'})
    # handler = ProxyHandler({'http': 'ip:port'})
    handler = ProxyHandler({'http': '39.137.107.98:80'})
    # build an opener that routes requests through the proxy handler
    opener = build_opener(handler)
    # send the request through the custom opener
    response = opener.open(request)
    print(response.read().decode())
    
    

    5. Using cookies

    Use cookies to access pages that require a login.
    Two approaches: use the cookie directly within a session, or save it to a file and load it later.
    
    from urllib.request import Request, HTTPCookieProcessor, build_opener
    from urllib.parse import urlencode
    from fake_useragent import UserAgent
    
    # Log in
    login_url = 'http://www.zengqiang.club/admin/login'
    
    headers = {
        'User-Agent': UserAgent().chrome
    }
    
    form_date = {
        'username': '曾强',
        'password': 'ZQZ981004'
    }
    f_date = urlencode(form_date).encode()
    
    request = Request(login_url, headers=headers, data=f_date)
    handler = HTTPCookieProcessor()
    opener = build_opener(handler)
    opener.open(request)
    
    # Logged in; now request a page that needs the session
    url = 'http://www.zengqiang.club/admin/blogs'
    
    request = Request(url, headers=headers)
    response = opener.open(request)
    print(response.read().decode())
    
    from urllib.request import Request, HTTPCookieProcessor, build_opener
    from urllib.parse import urlencode
    from fake_useragent import UserAgent
    from http.cookiejar import MozillaCookieJar
    
    
    # Log in
    # and save the cookie to a file
    def get_cookie():
        login_url = 'http://www.zengqiang.club/admin/login'
        headers = {
            'User-Agent': UserAgent().chrome
        }
        form_date = {
            'username': '曾强',
            'password': 'ZQZ981004'
        }
        f_date = urlencode(form_date).encode()
        request = Request(login_url, headers=headers, data=f_date)
        cookie_jar = MozillaCookieJar()
        handler = HTTPCookieProcessor(cookie_jar)
        opener = build_opener(handler)
        opener.open(request)
        cookie_jar.save('cookie.txt', ignore_expires=True, ignore_discard=True)
    
    
    # Load the cookie
    # and visit the protected page
    def use_cookie():
        url = 'http://www.zengqiang.club/admin/blogs'
        headers = {
            'User-Agent': UserAgent().chrome
        }
        request = Request(url, headers=headers)
        cookie_jar = MozillaCookieJar()
        cookie_jar.load('cookie.txt', ignore_expires=True, ignore_discard=True)
        handler = HTTPCookieProcessor(cookie_jar)
        opener = build_opener(handler)
        response = opener.open(request)
        print(response.read().decode())


    if __name__ == '__main__':
        get_cookie()
        use_cookie()
    

    6. Using URLError

    Exception handling with try/except.
    
    from urllib.request import Request, urlopen
    from fake_useragent import UserAgent
    from urllib.error import URLError
    
    url = 'http://www.zengqiang.club/1.html'
    
    headers = {
        'User-Agent': UserAgent().random
    }
    
    try:
        request = Request(url, headers=headers)
    
        response = urlopen(request)
    
        print(response.read().decode())
    except URLError as e:
        if e.args == ():
            # an HTTPError (a subclass of URLError) has empty args and carries the HTTP status code
            print(e.code)
        else:
            # a plain URLError wraps the underlying socket error (e.g. connection refused)
            print(e.args[0].errno)
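
    A slightly more explicit version (a sketch, not from the original notes) catches HTTPError before URLError, since HTTPError is the subclass that actually carries a status code:

    from urllib.request import Request, urlopen
    from urllib.error import URLError, HTTPError
    from fake_useragent import UserAgent

    try:
        request = Request('http://www.zengqiang.club/1.html',
                          headers={'User-Agent': UserAgent().random})
        response = urlopen(request)
        print(response.read().decode())
    except HTTPError as e:
        # the server answered, but with an error status (404, 500, ...)
        print('HTTP error:', e.code)
    except URLError as e:
        # the request never got a proper response (DNS failure, refused connection, ...)
        print('URL error:', e.reason)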
    
    

    7. Using requests

    requests is more convenient than urllib and needs noticeably less code.
    
    import requests
    from fake_useragent import UserAgent
    
    # GET request
    # url = 'https://www.baidu.com/s'
    # headers = {
    #     'User-Agent': UserAgent().chrome
    # }
    # params = {
    #     'wd': '重庆文理学院'
    # }
    # response = requests.get(url, headers=headers, params=params)
    # response.encoding = 'utf-8'
    #
    # print(response.url)
    
    # POST request
    url = 'http://www.zengqiang.club/admin/login'
    form_data = {
        'username': '曾强',
        'password': 'ZQZ981004'
    }
    headers = {
        'User-Agent': UserAgent().random
    }
    response = requests.post(url, data=form_data, headers=headers)
    
    print(response.text)
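
    requests can also keep the login cookie around automatically if you use a Session; a sketch of the same login flow (whether it works depends on the site, as above):

    import requests
    from fake_useragent import UserAgent

    session = requests.Session()
    headers = {'User-Agent': UserAgent().random}
    # the Session stores any cookies returned by the login response...
    session.post('http://www.zengqiang.club/admin/login',
                 data={'username': '曾强', 'password': 'ZQZ981004'},
                 headers=headers)
    # ...and sends them automatically on later requests in the same session
    response = session.get('http://www.zengqiang.club/admin/blogs', headers=headers)
    print(response.text)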
    
    

    8. Using re (regular expressions)

    Memorise the common patterns.
    

    import re
    
    str = 'I love you6.6 forever'
    print('-------match() matches from the start of the string---------')
    m1 = re.match(r'I', str)
    m2 = re.match(r'\w', str)
    m3 = re.match(r'.', str)
    m4 = re.match(r'\D', str)
    m5 = re.match(r'\S', str)
    m6 = re.match(r'i', str, re.I)
    print(m6.group())

    print('-------search() scans the whole string and returns the first match---------')
    s1 = re.search(r'love', str)
    s2 = re.search(r'l\w+', str)
    s3 = re.search(r'y\w+.\d', str)
    print(s3.group())

    print('-------findall() returns every match---------')
    f1 = re.findall(r'o', str)
    print(f1)

    print('--------Practice---------')
    str1 = '<div><a href="http://www.python.com">python官网</a></div>'
    t1 = re.findall(r'p\w+[\u4e00-\u9fa5]', str1)
    t2 = re.findall(r'<a href="http://www.python.com">(.+)</a>', str1)
    t3 = re.findall(r'<a href="(.+)">', str1)
    print(t3)

    print('---------sub() replaces matched text-------')
    su1 = re.sub(r'<div>(.+)</div>', r'<span>\1</span>', str1)
    print(su1)
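
    If a pattern is reused many times (for example inside a crawl loop), it can be compiled once up front; a small sketch (not from the original notes):

    import re

    # compile once, reuse many times; flags can be combined with |
    link_pattern = re.compile(r'<a href="(.+?)">(.+?)</a>', re.I | re.S)
    sample = '<div><a href="http://www.python.com">python官网</a></div>'
    for href, text in link_pattern.findall(sample):
        print(href, ':', text)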
    
    

    9. Using re to scrape the blog titles from this site's home page

    import requests
    from fake_useragent import UserAgent
    import re
    
    
    url = 'http://www.zengqiang.club/'
    
    headers = {
        'User-Agent': UserAgent().random
    }
    
    response = requests.get(url, headers=headers)
    
    # print(response.text)
    info = response.text
    result = re.findall(r'<a href="/blog/\d+" target="_blank" class="m-black m-text-thin">(.+)</a>', info)
    print(result)
    
    

    10. Using bs4 (BeautifulSoup)

    Makes it easy to pull the content we need out of HTML.
    
    from bs4 import BeautifulSoup
    from bs4.element import Comment
    
    # The lxml parser is needed here; install it from one of the mirrors listed at the top, e.g.:
    # pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
    str = '''
    <title>尚学堂</title>
    <div class='info' float='left'>Welcome to SXT</div>
    <div class='info' float='right'>
        <span>Good Good Study</span>
        <a href='www.bjsxt.cn'></a>
        <strong><!--没用--></strong>
    </div>
    '''
    
    soup = BeautifulSoup(str, 'lxml')
    print(soup.title)
    print(soup.div)
    print(soup.div.attrs)
    print(soup.div.get('class'))
    print(soup.div.get('float'))
    print(soup.a['href'])
    print(soup.div.string)
    print(soup.div.text)
    print(soup.strong.string)
    print(type(soup.strong.string))
    
    if type(soup.strong.string) == Comment:
        print(soup.strong.string)
        print(soup.strong.prettify())
    else:
        print(soup.strong.text)
    
    str1 = '''
    <title id="title">尚学堂</title>
    <div class='info' id="info" float='left'>Welcome to SXT</div>
    <div class='info' float='right'>
        <span>Good Good Study</span>
        <a href='www.bjsxt.cn'></a>
        <strong><!--没用--></strong>
    </div>
    '''
    
    print('------------find_all()-------------')
    soup1 = BeautifulSoup(str1, 'lxml')
    print(soup1.find_all('title'))
    print(soup1.find_all(id='title'))
    print(soup1.find_all(class_='info'))  # class is a Python keyword, hence class_
    print(soup1.find_all(attrs={'float': 'left'}))
    
    print('------------select() css选择器-------------')
    print(soup1.select('title'))
    print(soup1.select('#title'))
    print(soup1.select('.info'))
    print(soup1.select('div > span'))  # the > needs spaces around it
    print(soup1.select('div span'))
    print(soup1.select('div'))
    print(soup1.select('div')[1])
    print(soup1.select('div')[1].select('span'))
    print(soup1.select('title')[0].text)
    
    

    11. Using XPath

    XPath pulls the parts we want out of the HTML.
    Scrape book titles and authors from Qidian's monthly-ticket ranking.
    
    from lxml import html
    import requests
    from fake_useragent import UserAgent
    
    url = "https://www.qidian.com/rank/yuepiao?chn=21"
    headers = {
        'User-Agent': UserAgent().random
    }
    response = requests.get(url, headers=headers)
    
    etree = html.etree
    
    e = etree.HTML(response.text)
    
    names = e.xpath('//h4/a/text()')
    authors = e.xpath('//p[@class="author"]/a[1]/text()')
    
    # for num in range(len(names)):
    #     print(names[num], ":", authors[num])
    
    for name, author in zip(names, authors):
        print(name, ":", author)
    
    # print(names)
    # print(authors)
    
    

    12. Using pyquery

    pyquery pulls the parts we want out of the HTML.
    Scrape proxy IPs from the Xici proxy site.
    
    from pyquery import PyQuery as pq
    import requests
    from fake_useragent import UserAgent
    
    url = 'https://www.xicidaili.com/nn/'
    
    headers = {
        'User-Agent': UserAgent().chrome
    }
    
    response = requests.get(url, headers=headers)
    
    doc = pq(response.text)
    
    strs = doc('#ip_list tr')
    
    for num in range(1, len(strs)):
        ip = strs.eq(num).find('td').eq(1).text()
        port = strs.eq(num).find('td').eq(2).text()
        type = strs.eq(num).find('td').eq(5).text()
        print(ip, ":", port, "----", type)
    
    

    13. Using json

    Mostly converting between JSON and strings.
    
    import json
    
    str = '{"name":"我的小世界"}'
    print(type(str))
    
    # Parse the JSON string into a Python object
    obj = json.loads(str)
    print(type(obj), ":", obj)

    # Serialize the object back into a string
    str1 = json.dumps(obj, ensure_ascii=False)
    print(type(str1), ":", str1)

    # Dump the object to a file as JSON
    json.dump(obj, open('json.txt', 'w', encoding='utf-8'), ensure_ascii=False)

    # Load the data back from the file
    str2 = json.load(open('json.txt', encoding='utf-8'))
    print(str2)
    

    14. Using jsonpath

    Pull the fields we need out of JSON data.
    
    from jsonpath import jsonpath
    import requests
    from fake_useragent import UserAgent
    import json
    
    # Online JSON viewer: https://www.json.cn/
    url = 'https://www.lagou.com/lbs/getAllCitySearchLabels.json'
    
    headers = {
        'User-Agent': UserAgent().chrome
    }
    
    response = requests.get(url, headers=headers)
    # two ways to turn the response into a JSON object
    city_names = jsonpath(json.loads(response.text), '$..name')
    city_codes = jsonpath(response.json(), '$..code')
    
    for city_name, city_code in zip(city_names, city_codes):
        print(city_name, ":", city_code)
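
    The '$..name' expression recurses from the root and collects every field called name, wherever it sits in the structure. The same idea on a tiny local dict (a sketch with made-up data shaped roughly like the Lagou response):

    from jsonpath import jsonpath

    data = {'content': {'data': {'allCitySearchLabels': {'A': [{'name': 'Anshan', 'code': '101'}]}}}}
    # '$' is the root, '..' means recursive descent
    print(jsonpath(data, '$..name'))   # ['Anshan']
    print(jsonpath(data, '$..code'))   # ['101']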
    
    

    15. Using multiple threads

    Multiple threads mainly improve crawling throughput.
    Scrape jokes from duanziwang.com (written in an object-oriented style).
    
    from threading import Thread
    from fake_useragent import UserAgent
    import requests
    from lxml import html
    from queue import Queue
    
    
    # Worker thread that downloads pages
    class Spider_html(Thread):
        def __init__(self, url_queue, html_queue):
            Thread.__init__(self)
            self.url_queue = url_queue
            self.html_queue = html_queue
    
        def run(self):
            headers = {
                'User-Agent': UserAgent().random
            }
            while self.url_queue.empty() == False:
                url = self.url_queue.get()
                response = requests.get(url, headers=headers)
                if response.status_code == 200:
                    self.html_queue.put(response.text)
    
    
    # Worker thread that parses the downloaded pages
    class ParseInfo(Thread):
        def __init__(self, html_queue):
            Thread.__init__(self)
            self.html_queue = html_queue
    
        def run(self):
            etree = html.etree
            while self.html_queue.empty() == False:
                e = etree.HTML(self.html_queue.get())
                contents = e.xpath('//div[@class="post-content"]/p/text()')
                # print(contents)
                with open('duanzi.txt', 'a', encoding='utf-8') as f:
                    for content in contents:
                        info = content
                        # write one joke per line so the file stays readable
                        f.write(info + '\n')
    
    if __name__ == '__main__':
        # queue of URLs to crawl
        url_queue = Queue()
        # queue of downloaded HTML
        html_queue = Queue()
        base_url = 'https://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}'
        for i in range(1, 11):
            new_url = base_url.format(i)
            url_queue.put(new_url)
            # print(new_url)

        # download the pages
        spider_html_list = []
        # start three downloader threads
        for i in range(0, 3):
            spider1 = Spider_html(url_queue, html_queue)
            spider_html_list.append(spider1)
            spider1.start()
    
        for spider_html in spider_html_list:
            spider_html.join()
    
        # parse the pages and extract the content we need
        parse_list = []
        for i in range(0, 3):
            parse = ParseInfo(html_queue)
            parse_list.append(parse)
            parse.start()
        for parse in parse_list:
            parse.join()
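
    One caveat: checking empty() and then calling get() is racy when several threads share a queue; if another thread drains it in between, get() blocks forever. A safer worker loop (a sketch, not the original code) waits on get() with a timeout instead:

    from queue import Queue, Empty
    from threading import Thread

    def worker(url_queue):
        while True:
            try:
                # wait up to 3 seconds for work instead of testing empty() first
                url = url_queue.get(timeout=3)
            except Empty:
                break  # the queue stayed empty, assume the crawl is finished
            print('would download', url)

    url_queue = Queue()
    for i in range(1, 11):
        url_queue.put('https://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}'.format(i))

    threads = [Thread(target=worker, args=(url_queue,)) for _ in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()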
    
    

    16. Using Yundama (captcha solving)

    Yundama is used to recognise the captcha shown at login.
    It requires registering an account and paying a small fee.
    http://www.yundama.com/
    
    # Captcha-solving helper (Yundama API client)
    import http.client, mimetypes, urllib, json, time, requests
    from PIL import Image
    
    ######################################################################
    
    class YDMHttp:
        apiurl = 'http://api.yundama.com/api.php'
        username = ''
        password = ''
        appid = ''
        appkey = ''
    
        def __init__(self, username, password, appid, appkey):
            self.username = username
            self.password = password
            self.appid = str(appid)
            self.appkey = appkey
    
        def request(self, fields, files=[]):
            response = self.post_url(self.apiurl, fields, files)
            response = json.loads(response)
            return response
    
        def balance(self):
            data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
                    'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['balance']
            else:
                return -9001
    
        def login(self):
            data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
                    'appkey': self.appkey}
            response = self.request(data)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['uid']
            else:
                return -9001
    
        def upload(self, filename, codetype, timeout):
            data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
                    'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
            file = {'file': filename}
            response = self.request(data, file)
            if (response):
                if (response['ret'] and response['ret'] < 0):
                    return response['ret']
                else:
                    return response['cid']
            else:
                return -9001
    
        def result(self, cid):
            data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
                    'appkey': self.appkey, 'cid': str(cid)}
            response = self.request(data)
            return response and response['text'] or ''
    
        def decode(self, filename, codetype, timeout):
            cid = self.upload(filename, codetype, timeout)
            if (cid > 0):
                for i in range(0, timeout):
                    result = self.result(cid)
                    if (result != ''):
                        return cid, result
                    else:
                        time.sleep(1)
                return -3003, ''
            else:
                return cid, ''
    
        def report(self, cid):
            data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid,
                    'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
            response = self.request(data)
            if (response):
                return response['ret']
            else:
                return -9001
    
        def post_url(self, url, fields, files=[]):
            for key in files:
                files[key] = open(files[key], 'rb');
            res = requests.post(url, files=files, data=fields)
            return res.text
    
    
    ######################################################################
    def get_code(filename):
        # username
        username = 'zq666_yh'

        # password
        password = 'ZQZ981004'

        # software ID, required for the developer revenue share; get it from "My Software" in the developer console
        appid = 10039

        # software key, also from "My Software" in the developer console
        appkey = 'f6248169a3f9857b57e778c52d9f5de2'

        # image file
        filename = filename

        # captcha type, e.g. 1004 = 4 alphanumeric characters; pricing differs per type,
        # see http://www.yundama.com/price.html for the full list
        codetype = 1005

        # timeout in seconds
        timeout = 60

        # sanity check
        if (username == 'username'):
            print('Please fill in the parameters before testing')
        else:
            # initialise the client
            yundama = YDMHttp(username, password, appid, appkey)

            # log in to Yundama
            uid = yundama.login();
            # print('uid: %s' % uid)

            # check the account balance
            balance = yundama.balance();
            # print('balance: %s' % balance)

            # start recognition: image path, captcha type ID, timeout (seconds); returns the result
            cid, result = yundama.decode(filename, codetype, timeout);
            # print('cid: %s, result: %s' % (cid, result))
            return result
        ######################################################################
    
    
    if __name__ == '__main__':
        img = 'yzm1.jpg'
        code = get_code(img)
        print(code)
    
    # Usage (the helper above saved as ydm_util.py)
    import requests
    from fake_useragent import UserAgent
    from 爬虫学习.ydm_util import get_code
    
    
    def get_image():
        img_url = 'http://www.yundama.com/index/captcha'
        response = session.get(img_url, headers=headers)
        with open('yzm.jpg', 'wb')as f:
            f.write(response.content)
        code = get_code('yzm.jpg')
        print(code)
        return code
    
    
    def do_login(code):
        login_url = 'http://www.yundama.com/index/login?'
        f_data = {
            'username': 'zq666_yh',
            'password': 'ZQZ981004',
            'utype': '1',
            'vcode': code
        }
        response = session.get(login_url, headers=headers, params=f_data)
        print(response.text)
    
    
    # all three steps must run inside the same session
    if __name__ == '__main__':
        session = requests.Session()
        index_url = 'http://www.yundama.com/'
        headers = {
            'User-Agent': UserAgent().random
        }
        response = session.get(index_url, headers=headers)
        code = get_image()
        do_login(code)
    
    
    

    17. Using selenium

    selenium drives a real browser for testing and lets you automate actions in it.
    A browser driver such as chromedriver.exe must first be placed in the Scripts folder of the Python install directory (or anywhere on PATH).
    
    from selenium import webdriver
    
    chrome = webdriver.Chrome()
    
    chrome.get('http://www.zengqiang.club')
    
    # chrome.save_screenshot('zqclub.jpg')
    
    # html = chrome.page_source
    # print(html)
    
    id_content = chrome.find_element_by_id('run_time').text
    print(id_content)
    
    chrome.find_element_by_name('query').send_keys('爬虫')
    chrome.find_element_by_class_name('search').click()
    chrome.save_screenshot('爬虫.jpg')
    print(chrome.current_url)
    # get the cookies for the current session (do this before closing the window)
    print(chrome.get_cookies())

    # close the current window
    chrome.close()
    
    chrome.quit()
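
    Note that in Selenium 4 the find_element_by_* helpers were removed, so on a current install the lookups above need the By locator style instead. A minimal sketch, assuming the same element IDs and names as above:

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    chrome = webdriver.Chrome()
    chrome.get('http://www.zengqiang.club')

    # By.ID / By.NAME / By.CLASS_NAME replace the old find_element_by_* helpers
    print(chrome.find_element(By.ID, 'run_time').text)
    chrome.find_element(By.NAME, 'query').send_keys('爬虫')
    chrome.find_element(By.CLASS_NAME, 'search').click()
    chrome.quit()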
    

    18. Exercise: scraping 360 movie information

    Practise all four extraction approaches: xpath, re, bs4 and pyquery.
    
    import requests
    from fake_useragent import UserAgent
    from lxml import html
    from random import randint
    from time import sleep
    from bs4 import BeautifulSoup
    import re
    from pyquery import PyQuery
    
    
    # Fetch the HTML for a page
    def get_html(url):
        headers = {
            'User-Agent': UserAgent().random
        }
        # pause 2-6 seconds so the traffic looks more like a human visitor
        sleep(randint(2, 6))
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        else:
            return None
    
    
    # Parse the index page (collect the URLs of the movie detail pages)
    def parse_index(index_html):
        # ------pq--------
        doc = PyQuery(index_html)
        moive_a = doc('ul.list.g-clear a')
        moive_urls = []
        for a in moive_a:
            moive_urls.append(a.attrib['href'])
    
        # ------re--------
        # moive_urls = re.findall(r'<a class="js-tongjic" href="(.+)">', index_html)
    
        # ------bs4--------
        # soup = BeautifulSoup(index_html, 'lxml')
        # moive_a = soup.select('ul.list.g-clear a')
        # # print(moive_a)
        # moive_urls = []
        # for a in moive_a:
        #     moive_urls.append(a['href'])
    
        # ------xpath--------
        # etree = html.etree
        # e = etree.HTML(index_html)
        # moive_urls = e.xpath('//ul[@class="list g-clear"]//a/@href')
    
        return ['https://www.360kan.com{}'.format(url) for url in moive_urls]
    
    
    # Parse a movie detail page and extract the fields we need
    def parse_info(movie_html):
        # ------pq--------
        doc = PyQuery(movie_html)
        name = doc('h1').text()
        types = doc('p.item > a.cat').text()
        actors = doc('p.item.item-actor > a').text()
    
        # ------re--------
        # name = re.findall(r'<h1>(.+)</h1>', movie_html)[0]
        # types = re.findall(r'class="cat.+href.+">(.+)</', movie_html)
        # actors = re.findall(r'<a class="name" href=".+">(.+)</a>', movie_html)
    
        # ------bs4--------
        # soup = BeautifulSoup(movie_html, 'lxml')
        # name = soup.select('h1')[0].text
        # type = soup.select('p.item')[0].select('a')
        # types = []
        # for t in type:
        #     types.append(t.text)
        # actor = soup.select('p.item.item-actor')[0].select('a')
        # actors = []
        # for a in actor:
        #     actors.append(a.text)
    
        # ------xpath--------
        # etree = html.etree
        # e = etree.HTML(movie_html)
        # name = e.xpath('//h1/text()')[0]
        # types = e.xpath('//p[@class="item"][1]/a/text()')
        # actors = e.xpath('//p[@class="item item-actor"]/a/text()')
        return {
            'name': name,
            'types': types,
            'actors': actors
        }
    
    
    # Main: walk the movie URLs and print the scraped data
    def main():
        index_url = 'https://www.360kan.com/dianying/list.php?year=all&area=all&act=all&cat=all'
        index_html = get_html(index_url)
        moive_urls = parse_index(index_html)
        print(moive_urls)
        for url in moive_urls:
            moive_html = get_html(url)
            moive = parse_info(moive_html)
            print(moive)
    
    
    if __name__ == '__main__':
        main()
    
    

    19. Exercise: scraping the streamers currently live on Huya

    Scraped with selenium.
    
    from selenium import webdriver
    from time import sleep
    
    driver = webdriver.Chrome()
    
    url = 'https://www.huya.com/g/2356'
    
    driver.get(url)
    
    num = 1
    while True:
        print('Page', str(num), '------------')
        num += 1
        sleep(5)
        html = driver.page_source
        titles = driver.find_elements_by_xpath('//a[@class="title new-clickstat j_live-card"]')
        anthors = driver.find_elements_by_xpath('//i[@class="nick"]')
        audiences = driver.find_elements_by_xpath('//i[@class="js-num"]')
    
        for title, anthor, audience in zip(titles, anthors, audiences):
            print(title.text, '---', anthor.text, '---', audience.text)
        if html.find('laypage_next') != -1:
            driver.find_element_by_xpath('//a[@class="laypage_next"]').click()
        else:
            break
    
    

    20. Using the scrollbar with selenium

    Some pages only render all of their content after you scroll.
    Scrape product listings from JD.
    
    from selenium import webdriver
    from time import sleep
    
    url = 'https://search.jd.com/Search?keyword=iqoo&enc=utf-8&pvid=1c71f2514c724500b5c4e7f4dc58c1f2'
    
    driver = webdriver.Chrome()
    driver.get(url)
    
    js = 'document.documentElement.scrollTop=100000'
    driver.execute_script(js)
    
    sleep(3)
    html = driver.page_source
    
    names = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//a/em')
    prices = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//strong/i')
    print(len(names))
    
    for name, price in zip(names, prices):
        print(name.text, ':', price.text)
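
    Jumping straight to the bottom in one go can skip items that lazy-load in the middle of the page; a variation (a sketch, not from the original notes) that scrolls down in steps instead:

    from selenium import webdriver
    from time import sleep

    driver = webdriver.Chrome()
    driver.get('https://search.jd.com/Search?keyword=iqoo&enc=utf-8')

    # scroll a tenth of the page at a time so lazy-loaded items get a chance to render
    for step in range(1, 11):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight * arguments[0] / 10);', step)
        sleep(1)

    print(len(driver.page_source))
    driver.quit()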
    
    

    21. Exercise: scraping every photo set on the Tuchong front page

    import requests
    from fake_useragent import UserAgent
    from lxml import html
    from selenium import webdriver
    
    
    def get_group_urls():
        driver = webdriver.Chrome()
        index_url = 'https://tuchong.com/'
        driver.get(index_url)
        index_html = driver.page_source
        # print(index_html)
        etree = html.etree
        e = etree.HTML(index_html)
        group_urls = e.xpath('//div[@class="post-item"]/a[1]/@href')
        return group_urls
    
    
    def get_group_html(group_urls):
        etree = html.etree
        headers = {'User-Agent': UserAgent().random}
        group_num = 1
        for url in group_urls:
            group_name = 'group' + str(group_num)
            group_num += 1
            response = requests.get(url, headers=headers)
            e = etree.HTML(response.text)
            # print(response.text)
            img_urls = e.xpath('//article[@class="post-content"]//img[@class="multi-photo-image"]/@src')
            print(img_urls)
            for img_url in img_urls:
                img_name = img_url[img_url.rfind('/') + 1:]
                save_img(img_url, group_name, img_name)
    
    
    def save_img(img_url, group_name, img_name):
        headers = {'User-Agent': UserAgent().random}
        response = requests.get(img_url, headers=headers)
        # group_name already carries the 'group' prefix added by the caller
        with open('img/' + group_name + '-' + img_name, 'wb') as f:
            f.write(response.content)
    
    
    def main():
        group_urls = get_group_urls()
        get_group_html(group_urls)
    
    
    if __name__ == '__main__':
        main()
    
    

    22. Double-colour ball lottery example (saving data to a database)

    Connect to the database
    and insert the scraped draw data.
    
    import requests
    from fake_useragent import UserAgent
    from lxml import html
    import pymysql
    
    
    def get_html(url):
        headers = {
            'User-Agent': UserAgent().random
        }
        response = requests.get(url, headers=headers)
        return response.text
    
    
    def save_mysql(trs, date_time):
        client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', charset='utf8',
                                 db='python')
        print('Connected to the database')
        cursor = client.cursor()
        sql = 'insert into double_ball values(0,%s,%s,%s)'

        for tr, time in zip(trs, date_time):
            # extract the red balls
            red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
            # extract the blue ball
            blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
            print("Draw " + time + " - red balls: " + red_ball + " blue ball: " + blue_ball)
            cursor.execute(sql, [time, red_ball, blue_ball])
            client.commit()

        cursor.close()
        client.close()
        print('Finished saving the data')
    
    
    def main():
        url = 'https://datachart.500.com/ssq/'
        html_ = get_html(url)
        etree = html.etree
        e = etree.HTML(html_)
        data_time = e.xpath('//tbody[@id="tdata"]/tr/td[@align="center"]/text()')
        trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
        save_mysql(trs, data_time)
    
    
    if __name__ == '__main__':
        main()
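
    The INSERT above assumes a double_ball table already exists, with an auto-increment id plus three text columns; the notes don't show the schema, so the column names below are only a guess at what it might look like:

    import pymysql

    ddl = '''
    CREATE TABLE IF NOT EXISTS double_ball (
        id INT PRIMARY KEY AUTO_INCREMENT,  -- the literal 0 in the INSERT lets AUTO_INCREMENT assign the id
        issue VARCHAR(20),                  -- draw number (hypothetical column name)
        red_ball VARCHAR(30),               -- red balls joined with '-'
        blue_ball VARCHAR(10)               -- blue ball
    )
    '''

    client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004',
                             charset='utf8', db='python')
    with client.cursor() as cursor:
        cursor.execute(ddl)
    client.commit()
    client.close()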
    
    

    23. A cleaner way to write a scraper (structured, class-based)

    Split each responsibility out into its own class and method;
    it reads much more like production code.
    
    import requests
    from fake_useragent import UserAgent
    from lxml import etree
    
    
    # URL manager
    class URLManager(object):
        def __init__(self):
            self.new_url = []
            self.old_url = []

        # hand out one URL to crawl
        def get_new_url(self):
            url = self.new_url.pop()
            self.old_url.append(url)
            return url

        # add a single URL
        def add_new_url(self, url):
            if url not in self.new_url and url and url not in self.old_url:
                self.new_url.append(url)

        # add several URLs
        def add_new_urls(self, urls):
            for url in urls:
                self.add_new_url(url)

        # is there anything left to crawl?
        def has_new_url(self):
            return self.get_new_url_size() > 0

        # number of URLs waiting to be crawled
        def get_new_url_size(self):
            return len(self.new_url)

        # number of URLs already crawled
        def get_old_url_size(self):
            return len(self.old_url)
    
    
    # Downloader
    class Downloader:
        def download(self, url):
            response = requests.get(url, headers={"User-Agent": UserAgent().random})
            if response.status_code == 200:
                response.encoding = 'utf-8'
                return response.text
            else:
                return None
    
    
    # Parser
    class Parser:
        def parse(self, html):
            e = etree.HTML(html)
            datas = self.parse_info(e)
            #datas = [span.xpath('string(.)') for span in e.xpath('//div[@class="content"]/span[1]')]
            urls = self.parse_urls(e)
            #urls = [ 'https://www.qiushibaike.com{}'.format(url) for url in e.xpath('//ul[@class="pagination"]/li/a/@href')]
            return datas, urls
    
        def parse_info(self, e):
            spans = e.xpath('//div[@class="content"]/span[1]')
            datas = []
            for span in spans:
                datas.append(span.xpath('string(.)'))
            return datas
    
        def parse_urls(self, e):
            base_url = 'https://www.qiushibaike.com{}'
            urls = []
            for url in e.xpath('//ul[@class="pagination"]/li/a/@href'):
                urls.append(base_url.format(url))
            return urls
    
    
    # Data output
    class DataOutPut:
        def save(self, datas):
            with open('duanzi.txt', 'a', encoding='utf-8') as f:
                for data in datas:
                    f.write(data)
    
    
    # Scheduler: wires the other components together
    class DiaoDu:
        def __init__(self):
            self.downloader = Downloader()
            self.url_manager = URLManager()
            self.parser = Parser()
            self.data_saver = DataOutPut()
    
        def run(self, url):
            self.url_manager.add_new_url(url)
            while self.url_manager.has_new_url():
                url = self.url_manager.get_new_url()
                html = self.downloader.download(url)
                data, urls = self.parser.parse(html)
                self.data_saver.save(data)
                self.url_manager.add_new_urls(urls)
    
    
    if __name__ == '__main__':
        diao_du = DiaoDu()
        diao_du.run('https://www.qiushibaike.com/text/page/1/')
    
    

    Resources

    Link: https://pan.baidu.com/s/10e8PphvR7Um0-WPAylw8Yw

    Extraction code: h8i8
