zoukankan      html  css  js  c++  java
  • 爬虫重要案例总结与回顾

    一.爬取化妆品生产许可证相关

    import requests
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    
    
    def get_ID(pages, page_size=15):
        """Collect record IDs from the first ``pages`` listing pages.

        Sends one POST request per page to the ``getXkzsList`` endpoint and
        gathers the ``ID`` field of every entry under the ``list`` key of the
        JSON response.

        Args:
            pages: Number of listing pages to request, starting at page 1.
                Zero or a negative value performs no request at all.
            page_size: Value sent as ``pageSize``; defaults to 15.

        Returns:
            A list with the ``ID`` values in the order the server returned them.
        """
        import time  # imported once here instead of on every loop iteration

        url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'

        ID_list = []

        for every_page in range(1, pages + 1):
            params = {
                "on": "true",
                "page": str(every_page),
                "pageSize": str(page_size),
                "productName": "",
                "conditionType": "1",
                "applyname": "",
                "applysn": "",
            }

            data = requests.post(url=url, params=params, headers=headers).json()

            ID_list.extend(each_dict['ID'] for each_dict in data["list"])

            # Brief pause between consecutive page requests.
            time.sleep(0.1)

        return ID_list
    
    
    def get_all_detail(ID_list):
        """Fetch the detail record of every ID and append it to a text file.

        For each ID one POST request is sent to the ``getXkzsById`` endpoint and
        the raw response text is appended, one record per line, to
        ``化妆品生产许可证信息.txt`` in the current working directory.

        Args:
            ID_list: Iterable of record IDs, e.g. the result of ``get_ID``.

        Returns:
            None. A confirmation message is printed when all records are written.
        """
        import time  # imported once here instead of on every loop iteration

        url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById"

        with open('化妆品生产许可证信息.txt', 'a', encoding='utf-8') as f:
            for ID in ID_list:
                params = {
                    "id": ID
                }

                data = requests.post(url=url, params=params, headers=headers).text

                # Terminate each record with a newline escape; the published
                # listing had the literal broken across two source lines.
                f.write(data + '\n')

                # Brief pause between consecutive detail requests.
                time.sleep(0.1)

        print("数据写入文件成功!")
    
    
    # Collect the record IDs from the first 10 listing pages, then fetch each
    # record's detail and append it to the output file.
    ID_list = get_ID(10)
    get_all_detail(ID_list)
    

    二.爬取肯德基餐厅查询指定地点

    import requests
    
    # Endpoint of the store-lookup service.
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }

    # Ask the user for a place name and build the query parameters around it.
    area = input('请输入一个地名: ')
    params = dict(cname="", pid="", keyword=area, pageIndex="1", pageSize="10")

    # Send the request, then decode the JSON body of the response.
    response = requests.get(url=url, params=params, headers=headers)
    data = response.json()

    print(data)
    

    三.正则 - 爬取糗事百科图片

    import re
    import os
    import time
    import requests
    from urllib import request
    
    # Create the output directory inside the working directory. A single
    # makedirs call replaces two separate existence checks, one of which
    # created '/qiutu' at the filesystem root instead of './qiutu'.
    os.makedirs('./qiutu', exist_ok=True)

    url = "https://www.qiushibaike.com/pic/"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    page_text = requests.get(url=url, headers=headers).text

    # Capture the src attribute of the image inside every "thumb" div.
    # re.S lets '.' span line breaks inside the div.
    img_url_list = re.findall(r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>', page_text, re.S)

    for img_url in img_url_list:
        # The captured src has no scheme, so prepend one.
        img_url = 'https:' + img_url
        img_name = img_url.split('/')[-1]
        img_path = './qiutu/' + img_name
        request.urlretrieve(img_url, img_path)
        print(img_path, '下载成功!')

        # Brief pause between downloads.
        time.sleep(0.1)
    

    四.bs4 - 爬取诗词名句网站中三国演义小说

    import requests
    from bs4 import BeautifulSoup
    
    headers={
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
         }
    def parse_content(url):
        """Download a chapter page and return the text of its content block.

        Args:
            url: Address of the chapter page.

        Returns:
            The text inside the ``div`` with class ``chapter_content``, or an
            empty string when the page contains no such element.
        """
        # Fetch the chapter page.
        page_text = requests.get(url, headers=headers).text
        soup = BeautifulSoup(page_text, 'lxml')
        # Locate the element that holds the chapter body.
        ele = soup.find('div', class_='chapter_content')
        if ele is None:
            # find() returns None when nothing matches; avoid AttributeError.
            return ''
        return ele.text
    
    if __name__ == "__main__":
        url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
        reponse = requests.get(url=url, headers=headers)
        page_text = reponse.text

        # Parse the table-of-contents page and select one <a> per chapter.
        soup = BeautifulSoup(page_text, 'lxml')
        a_eles = soup.select('.book-mulu > ul > li > a')
        print(a_eles)

        # Open the output file once, before the loop. Reopening it in 'w' mode
        # for every chapter truncates it, leaving only the last chapter on disk.
        # An explicit encoding avoids depending on the platform default.
        with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
            # enumerate keeps the start and end messages on the same number.
            for cap, ele in enumerate(a_eles, start=1):
                print('开始下载第%d章节' % cap)
                # .string is None when the tag has nested children; fall back
                # to the tag's full text in that case.
                title = ele.string or ele.text
                content_url = 'http://www.shicimingju.com' + ele['href']
                content = parse_content(content_url)

                # Five newlines separate consecutive chapters.
                fp.write(title + ":" + content + '\n\n\n\n\n')
                print('结束下载第%d章节' % cap)
    

    五.xpath解析相关实例

    1.解析58二手房的相关数据

    import requests
    from lxml import etree
    
    url = 'https://bj.58.com/ershoufang/?PGTID=0d200001-0000-1376-eb9f-25ca6cacedce&ClickID=1'

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    # Download the listing page.
    response = requests.get(url=url, headers=headers)
    page_text = response.text

    # Parse the HTML and select one <li> per listing.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')

    if __name__ == '__main__':

        for li in li_list:
            # First text node of the title link, with surrounding whitespace removed.
            title_nodes = li.xpath('./div[2]/h2/a/text()')
            title = title_nodes[0].strip()
            print(title)
    

    2.下载彼岸图网中的图片数据:中文乱码问题

    import requests, os
    from lxml import etree
    from urllib import request
    
    # Make sure the folder that receives the downloaded images exists.
    if not os.path.exists('./images'):
        os.mkdir('./images')

    url = 'http://pic.netbian.com/4kmeinv/'

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    # Download the listing page.
    response = requests.get(url=url, headers=headers)
    page_text = response.text

    # Build the element tree and select one <li> per picture.
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    for li in li_list:
        raw_name = li.xpath('./a/img/@alt')[0]
        # The alt text arrives mis-decoded: turn it back into bytes via
        # ISO-8859-1 and decode those bytes as GBK to recover the Chinese name.
        img_name = raw_name.encode('ISO-8859-1').decode('gbk')

        img_src = li.xpath('./a/img/@src')[0]
        img_url = 'http://pic.netbian.com' + img_src
        img_path = './images/' + img_name + '.jpg'
        request.urlretrieve(url=img_url, filename=img_path)
        print("下载完成!!!")
    

    3.下载煎蛋网中图片数据(数据经过加密)

    from lxml import etree
    from urllib import request
    import requests
    import base64
    import os
    
    # Make sure the output folder exists.
    if not os.path.exists('./jiandan'):
        os.mkdir('./jiandan')

    url = 'http://jandan.net/ooxx/page-62'

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    response = requests.get(url=url, headers=headers)
    page_text = response.text

    tree = etree.HTML(page_text)

    # Each span of class "img-hash" carries a base64-encoded image address.
    img_hash_list = tree.xpath('//span[@class="img-hash"]/text()')

    for img_hash in img_hash_list:
        # Decode the base64 payload; the result has no scheme, so add one.
        decoded = base64.b64decode(img_hash).decode('utf8')
        img_url = "http:" + decoded

        # Store the image under its original file name.
        file_name = img_url.split('/')[-1]
        img_path = './jiandan/' + file_name

        # Download straight to disk.
        request.urlretrieve(url=img_url, filename=img_path)
        print("下载完成!!", img_url)

    print('over!')
    

    4.下载站长素材中的简历模板数据

    import requests
    import random
    import os
    from lxml import etree
    
    # Make sure the output folder exists.
    if not os.path.exists('./jianli'):
        os.mkdir('./jianli')

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    for i in range(1, 6):
        # The first listing page has no numeric suffix; later pages are free_<n>.html.
        url = 'http://sc.chinaz.com/jianli/free.html' if i == 1 else f'http://sc.chinaz.com/jianli/free_{i}.html'

        # Request the listing page and force UTF-8 decoding of the body.
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf8'
        page_text = response.text

        # Parse the page and select every template link.
        tree = etree.HTML(page_text)
        a_list = tree.xpath('//a[@class="title_wl"]')

        for a in a_list:
            jianli_name = a.xpath('./text()')[0]
            jianli_url = a.xpath('./@href')[0]
            print(jianli_name)
            print(jianli_url)
            print('----------------------------------------------')

            # Open the template's detail page.
            response2 = requests.get(url=jianli_url, headers=headers)
            response2.encoding = 'utf8'
            tree2 = etree.HTML(response2.text)

            # All download links offered for this template; pick one at random.
            download_url_list = tree2.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
            download_url = random.choice(download_url_list)

            # Fetch the archive bytes and write them to disk.
            res = requests.get(url=download_url, headers=headers).content
            filepath = './jianli/' + jianli_name + '.rar'
            with open(filepath, 'wb') as f:
                f.write(res)
            print(jianli_name, '下载完成!')

    print('over!')
    

    5.解析所有城市名称

    """
    解析所有城市名称
    https://www.aqistudy.cn/historydata/
    """
    
    import requests
    from lxml import etree
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    url = 'https://www.aqistudy.cn/historydata/'

    # Download the page, decode it as UTF-8 and parse it.
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf8'
    page_text = response.text
    tree = etree.HTML(page_text)

    # XPath expressions for the two city sections of the page.
    hot_xpath = '//div[@class="hot"]/div[2]/ul/li/a/text()'
    all_xpath = '//div[@class="all"]/div[2]/ul/div[2]/li/a/text()'

    hot_city_list = tree.xpath(hot_xpath)
    all_city_list = tree.xpath(all_xpath)
    # The '|' operator evaluates both expressions in a single query.
    cityname_list = tree.xpath(hot_xpath + ' | ' + all_xpath)

    print('-----------------------------------------------------------')
    print(hot_city_list)

    print('***********************************************************')
    print(all_city_list)

    print('###########################################################')
    print(cityname_list)
    

    六.图片懒加载

    """
    图片懒加载概念:
        - 图片懒加载是一种网页优化技术.图片作为一种网络资源,
        在被请求时也与普通静态资源一样,将占用网络资源,
        而一次性将整个页面的所有图片加载完,
        将大大增加页面的首屏加载时间.为了解决这种问题,通过前后端配合,
        使图片仅在浏览器当前视窗内出现时才加载该图片,
        达到减少首屏图片请求数的技术就被称为"图片懒加载".
    
    网站一般如何实现图片懒加载技术呢?
        - 在网页源码中,在img标签中首先会使用一个"伪属性"(通常使用src2,original...)
        去存放真正的图片链接而并非是直接存放在src属性中.当图片出现到页面的可视化区域中,
        会动态将伪属性替换成src属性,完成图片的加载.
    """
    
    import os
    import requests
    from urllib import request
    from lxml import etree
    
    # Make sure the output folder exists.
    if not os.path.exists('./images'):
        os.mkdir('./images')

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    url = 'http://sc.chinaz.com/tupian/'

    # Download the page, decode it as UTF-8 and parse it.
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf8'
    page_text = response.text
    tree = etree.HTML(page_text)
    img_list = tree.xpath('//div[@class="box picblock col3"]/div/a/img')

    for img in img_list:
        img_name = img.xpath('./@alt')[0]
        # The image address is read from the src2 attribute rather than src.
        img_url = img.xpath('./@src2')[0]

        file_path = './images/' + img_name + '.jpg'
        request.urlretrieve(img_url, file_path)
        print("下载完成!!!", img_name)

    print('over!')
    
    """
    站长素材案例后续分析:
        - 通过细致观察页面的结构后发现,网页中图片的链接是存储在了src2这个伪属性中
    """
    

    七.使用云打码平台识别验证码

    ydmhttp.py:

    import http.client, mimetypes, urllib, json, time, requests
    
    
    class YDMHttp:
        """Small HTTP client for the captcha-recognition API at ``apiurl``.

        Every call posts the account and application credentials together with
        a ``method`` name and decodes the JSON reply. Methods that return a
        status give -9001 when the decoded reply is empty and the reply's
        ``ret`` value when it is negative.
        """

        apiurl = 'http://api.yundama.com/api.php'
        username = ''
        password = ''
        appid = ''
        appkey = ''

        def __init__(self, username, password, appid, appkey):
            """Store the credentials; ``appid`` is kept as a string."""
            self.username = username
            self.password = password
            self.appid = str(appid)
            self.appkey = appkey

        def _fields(self, method, **extra):
            """Return the form fields for ``method`` plus any extra fields."""
            data = {'method': method, 'username': self.username, 'password': self.password,
                    'appid': self.appid, 'appkey': self.appkey}
            data.update(extra)
            return data

        @staticmethod
        def _ret_or(response, key):
            """Return a negative ``ret`` if present, else ``response[key]``; -9001 for an empty reply."""
            if not response:
                return -9001
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            return response[key]

        def request(self, fields, files=None):
            """Post ``fields`` (and optional ``files``) and return the decoded JSON reply."""
            response = self.post_url(self.apiurl, fields, files)
            return json.loads(response)

        def balance(self):
            """Return the reply's ``balance`` value, or an error code."""
            return self._ret_or(self.request(self._fields('balance')), 'balance')

        def login(self):
            """Return the reply's ``uid`` value, or an error code."""
            return self._ret_or(self.request(self._fields('login')), 'uid')

        def upload(self, filename, codetype, timeout):
            """Upload the image at ``filename`` and return the reply's ``cid``, or an error code."""
            data = self._fields('upload', codetype=str(codetype), timeout=str(timeout))
            return self._ret_or(self.request(data, {'file': filename}), 'cid')

        def result(self, cid):
            """Return the reply's ``text`` for ``cid``, or '' when it is missing or empty."""
            response = self.request(self._fields('result', cid=str(cid)))
            return (response['text'] or '') if response else ''

        def decode(self, filename, codetype, timeout):
            """Upload an image and poll for its recognised text.

            Polls once per second, at most ``timeout`` times.

            Returns:
                ``(cid, text)`` on success, ``(-3003, '')`` when no text arrived
                within ``timeout`` polls, or ``(error_code, '')`` when the
                upload did not return a positive ``cid``.
            """
            cid = self.upload(filename, codetype, timeout)
            if cid <= 0:
                return cid, ''
            for _ in range(timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''

        def report(self, cid):
            """Send a report for ``cid`` with ``flag`` '0' and return the reply's ``ret``."""
            response = self.request(self._fields('report', cid=str(cid), flag='0'))
            if response:
                return response['ret']
            return -9001

        def post_url(self, url, fields, files=None):
            """POST ``fields`` to ``url``, attaching the files named in ``files``.

            Args:
                url: Target address.
                fields: Form fields to send.
                files: Optional mapping of form-field name to file path.

            Returns:
                The response body as text.
            """
            # Open into a separate dict so the caller's mapping is not
            # overwritten, and close every handle once the request is done.
            opened = {}
            try:
                for key, path in (files or {}).items():
                    opened[key] = open(path, 'rb')
                res = requests.post(url, files=opened, data=fields)
                return res.text
            finally:
                for handle in opened.values():
                    handle.close()
    
    def getCodeData(username, password, filename, codetype, timeout, appid=1234, appkey='xxx'):
        """Recognise the captcha image at ``filename`` and return its text.

        Logs in, prints the uid and account balance, then uploads the image and
        waits for the recognised text.

        Args:
            username: Account user name. The placeholder value ``'username'``
                is rejected with a printed hint and nothing is sent.
            password: Account password.
            filename: Path of the captcha image file.
            codetype: Captcha type code (for example 1004 for four
                alphanumeric characters); see http://www.yundama.com/price.html.
            timeout: Timeout in seconds.
            appid: Software ID from the developer console; defaults to 1234.
            appkey: Software key from the developer console; defaults to 'xxx'.

        Returns:
            The recognised text, or an empty string when the placeholder user
            name was passed or no text was recognised.
        """
        # Bind the result up front so the placeholder branch returns '' instead
        # of raising UnboundLocalError at the final return.
        result = ''

        if username == 'username':
            print('请设置好相关参数再测试')
            return result

        yundama = YDMHttp(username, password, appid, appkey)

        # Log in.
        uid = yundama.login()
        print('uid: %s' % uid)

        # Query the account balance.
        balance = yundama.balance()
        print('balance: %s' % balance)

        # Upload the image and wait for the recognised text.
        cid, result = yundama.decode(filename, codetype, timeout)
        print('cid: %s, result: %s' % (cid, result))
        return result
    

    八.模拟登录人人网,爬取个人中心页面数据

    """
    cookie的处理:
    1. 手动处理
        - cookie封装到headers
    2. 自动处理
        - (1)获取一个session对象
        - (2)使用session对象进行请求的发送
        - (3)作用: 在使用session进行请求发送的过程中如果产生了cookie,
                则cookie会被自动存储到session对象中.
    """
    
    
    from ydmhttp import getCodeData # 识别人人网中的验证码图片
    import requests
    from urllib import request
    from lxml import etree
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    url = 'http://www.renren.com'

    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # xpath() returns an empty list when the page has no captcha image, so take
    # the first match only if there is one; indexing unconditionally would raise
    # IndexError and the "no captcha" branch below could never run.
    code_img_srcs = tree.xpath('//*[@id="verifyPic_login"]/@src')
    code_img_url = code_img_srcs[0] if code_img_srcs else ''

    if code_img_url:
        request.urlretrieve(url=code_img_url, filename='./code.jpg')
        # Recognise the captcha. 2004 is the type code for four Chinese
        # characters; other codes are listed at http://www.yundama.com/price.html
        code_data = getCodeData('username', 'password', './code.jpg', 2004, 30)
        print(code_data)  # the recognised text
    else:
        print('不需要识别验证码')
        code_data = ''

    # Address that receives the login request.
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019241516668'
    # Form fields as captured from a real login with a packet-capture tool.
    data = {
        "email": "xxx",  # your e-mail address
        "icode": code_data,
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": "1",
        "captcha_type": "web_login",
        "password": "xxx",  # your password in its encrypted form
        "rkey": "xxx",
        "f": "http%3A%2F%2Fwww.renren.com%2F970153909"
    }

    # Create a session; cookies set by the login response are stored on it.
    session = requests.Session()
    session.post(url=login_url, data=data, headers=headers)
    # Address of the personal profile page.
    url = 'http://www.renren.com/970153909/profile'
    # Request the profile page through the same session so the cookies are sent.
    page_text = session.get(url=url, headers=headers).text
    # Save the page to disk.
    with open('renren.html', 'w', encoding='utf8') as f:
        f.write(page_text)
    

    九.对古诗文网进行模拟登录

    """
    cookie的处理:
        1. 手动处理:
            - 把cookie封装到headers中
        2. 自动处理:
            - (1)获取一个session对象
            - (2)使用session对象进行请求的发送
            - (3)作用: 在使用session进行请求发送的过程中,
                    如果产生了cookie,cookie就会被自动存储到session对象中.
    """
    
    
    from ydmhttp import getCodeData # 识别人人网中的验证码图片
    from urllib import request
    from lxml import etree
    import requests, os, uuid
    
    # Make sure the folder for downloaded files exists.
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    # One session for the whole login flow, so cookies set by any step are
    # sent with the following ones.
    session = requests.Session()

    # Address of the login page.
    url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    # Fetch the login page through the session as well: the hidden form fields
    # read from it below are posted back later by this same session.
    page_text = session.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)

    # Address of the captcha image shown on the login page.
    code_img_url = 'https://so.gushiwen.org' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    # Save the captcha under a unique base name; the result page reuses it.
    filepath = f'./sources/{uuid.uuid4()}'
    filename = filepath + '.jpg'
    # The captcha must be requested through the session, otherwise it does not
    # match the login attempt made with that session.
    img_data = session.get(url=code_img_url, headers=headers).content
    with open(filename, 'wb') as fp:
        fp.write(img_data)

    # Recognise the captcha. Use the credentials of a regular (not developer)
    # account; 1004 is the captcha type code and 50 the timeout in seconds.
    # Type codes: http://www.yundama.com/price.html
    code_data = getCodeData('username', 'password', filename, 1004, 50)

    # Address that receives the login request.
    login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'

    # Dynamic hidden form fields that must be posted back with the login.
    __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    __VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
    data = {
        "__VIEWSTATE": __VIEWSTATE,
        "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
        "from": "http://so.gushiwen.org/user/collect.aspx",
        "email": "xxx",  # your e-mail address
        "pwd": "xxx",  # your password
        "code": code_data,
        "denglu": "登录",
    }

    # Log in and keep the raw bytes of the page returned afterwards.
    index_text = session.post(url=login_url, data=data, headers=headers).content

    # Save that page next to the captcha image.
    filename2 = filepath + '.html'
    with open(filename2, 'wb') as f:
        f.write(index_text)

    print('下载成功!!!')
    

    十.使用线程池爬取梨视频的视频数据

    import requests, re, os
    from lxml import etree
    from uuid import uuid4
    
    # 导入线程池模块
    from multiprocessing.dummy import Pool
    
    # Thread pool with 10 worker threads.
    # Thread pools are intended for time-consuming operations.
    pool = Pool(10)

    # Make sure the folder for downloaded files exists.
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }

    url = 'https://www.pearvideo.com/category_1'
    listing_response = requests.get(url=url, headers=headers)
    page_text = listing_response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    # Addresses of all video files found on the listing page.
    video_url_list = []
    for li in li_list:
        href = li.xpath('./div/a/@href')[0]
        detail_url = 'https://www.pearvideo.com/' + href
        print(detail_url)
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # The video address sits inside a JavaScript snippet, out of reach of
        # XPath, so it is extracted with a regular expression instead.
        matches = re.findall('ldUrl="",srcUrl="(.*?)",vdoUrl=srcUrl', detail_page_text, re.S)
        video_url = matches[0]
        video_url_list.append(video_url)
    print(video_url_list)
    
    
    def getVideoData(url):
        """Download ``url`` and return the response body as bytes."""
        response = requests.get(url=url, headers=headers)
        return response.content
    
    
    def saveVideoData(data):
        """Write ``data`` to a uniquely named ``.mp4`` file under ``./sources``.

        Args:
            data: Raw bytes of one video.
        """
        # A random UUID keeps the names of concurrently saved files distinct.
        target_path = f'./sources/{uuid4()}.mp4'
        with open(target_path, 'wb') as video_file:
            video_file.write(data)
        print('下载成功!')
    
    
    # Download every video through the thread pool; video_data_list holds the
    # raw bytes of each video, in the order of video_url_list.
    video_data_list = pool.map(getVideoData, video_url_list)

    # Write the downloaded videos to disk, again through the thread pool.
    pool.map(saveVideoData, video_data_list)

    # No more work will be submitted: shut the pool down and wait for its
    # worker threads to finish so they are not left behind.
    pool.close()
    pool.join()
    

    十一.selenium的使用

    博客地址: https://www.cnblogs.com/bobo-zhang/p/9685362.html
    
    谷歌浏览器驱动下载地址: http://chromedriver.storage.googleapis.com/index.html
    
    下载的驱动程序必须和浏览器的版本统一,大家可以根据http://blog.csdn.net/huilan_same/article/details/51896672中提供的版本映射表进行对应
    
    PhantomJS下载地址: https://pan.baidu.com/s/11KMIKitILGpVU33oxxzcJA  # 提取码:og8o 
    

    1.百度文本输入框中录入中国

    """
    selenium: 可以让浏览器完成相关自动化的操作
    环境安装:
        - pip install selenium
    编码流程:
        - 导包
        - 创建某一款浏览器对象
        - 制定相关的行为动作
    """
    
    from selenium import webdriver
    import time, os
    
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    # Start Chrome through its driver executable.
    # NOTE(review): the published listing shows r'F:chromedriver.exe'; the
    # backslash was evidently lost, so it is restored here. Point this at the
    # location of your own chromedriver.
    browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
    try:
        time.sleep(3)

        browser.get('https://www.baidu.com/')
        time.sleep(3)

        # The find_* functions locate elements on the page.
        text_input = browser.find_element_by_id('kw')
        # Type a keyword into the search box.
        text_input.send_keys('中国')
        time.sleep(3)

        btn = browser.find_element_by_id('su')
        btn.click()
        time.sleep(3)

        # Source of the page as currently shown by the browser, including
        # dynamically loaded content.
        page_text = browser.page_source

        # Save the page to disk.
        with open('./sources/zhongguo.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)

        time.sleep(3)
    finally:
        # Shut the browser down even when one of the steps above fails.
        browser.quit()
    

    2.爬取更多的电影详情数据(豆瓣)

    from selenium import webdriver
    import time, os
    
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    # Start Chrome through its driver executable.
    # NOTE(review): the published listing shows r'F:chromedriver.exe'; the
    # backslash was evidently lost, so it is restored here. Point this at the
    # location of your own chromedriver.
    browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
    try:
        browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
        time.sleep(3)

        # Scroll to the bottom three times, pausing after each scroll so that
        # further entries can load.
        for _ in range(3):
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)

        # Source of the page as currently shown by the browser.
        page_text = browser.page_source

        # Save the page to disk.
        with open('./sources/douban.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)

        time.sleep(3)
    finally:
        # Shut the browser down even when one of the steps above fails.
        browser.quit()
    

    3.使用phantomJs浏览器爬取更多的电影详情数据(豆瓣)

    from selenium import webdriver
    import time, os
    
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    # Start PhantomJS through its executable.
    # NOTE(review): the published listing shows
    # r'F:phantomjs-2.1.1-windowsinphantomjs.exe'; the backslashes were
    # evidently lost, so they are restored here. Point this at the location of
    # your own PhantomJS executable.
    browser = webdriver.PhantomJS(
        executable_path=r'F:\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        browser.get('https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=')
        time.sleep(3)

        # Scroll to the bottom three times, pausing after each scroll so that
        # further entries can load.
        for _ in range(3):
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(3)

        # Source of the page as currently shown by the browser.
        page_text = browser.page_source

        # Save the page to disk.
        with open('./sources/douban2.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)

        time.sleep(3)
    finally:
        # Shut the browser down even when one of the steps above fails.
        browser.quit()
    

    4.使用谷歌无头浏览器爬取数据

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from time import sleep
    
    # Headless-mode configuration (the original note marks these lines as required).
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # Start Chrome with the options above.
    # NOTE(review): the published listing shows r'C:Userschromedriver.exe'; the
    # backslashes were evidently lost, so they are restored here. Point this at
    # the location of your own chromedriver.
    bro = webdriver.Chrome(executable_path=r'C:\Users\chromedriver.exe', chrome_options=chrome_options)
    try:
        sleep(3)

        bro.get('https://www.baidu.com/')
        sleep(3)

        # The find_* functions locate elements on the page.
        text_input = bro.find_element_by_id('kw')
        # Type a keyword into the search box.
        text_input.send_keys('中国')
        sleep(3)

        btn = bro.find_element_by_id('su')
        btn.click()
        sleep(3)

        # Source of the page as currently shown by the browser, including
        # dynamically loaded content.
        page_text = bro.page_source
        print(page_text)
    finally:
        # Shut the browser down even when one of the steps above fails.
        bro.quit()
    

    5.登录qq空间并爬取首页的数据

    from selenium import webdriver
    from lxml import etree
    import time, os
    
    if not os.path.exists('./sources'):
        os.mkdir('./sources')

    # Start Chrome through its driver executable.
    # NOTE(review): the published listing shows r'F:chromedriver.exe'; the
    # backslash was evidently lost, so it is restored here. Point this at the
    # location of your own chromedriver.
    browser = webdriver.Chrome(executable_path=r'F:\chromedriver.exe')
    try:
        browser.get('https://qzone.qq.com/')
        time.sleep(5)

        # WebDriver only sees elements of the document it is currently focused
        # on. Elements inside a nested frame cannot be located directly, so
        # focus is moved into the frame with switch_to.frame() first.
        browser.switch_to.frame('login_frame')

        # Click the element with id "switcher_plogin".
        browser.find_element_by_id('switcher_plogin').click()
        time.sleep(1)

        # Fill in the element with id "u".
        browser.find_element_by_id("u").send_keys("username")  # your user name

        # Fill in the element with id "p".
        browser.find_element_by_id("p").send_keys("password")  # your password

        # Click the element with id "login_button".
        browser.find_element_by_id("login_button").click()
        time.sleep(1)

        # Scroll to the bottom twice, pausing after each scroll.
        for _ in range(2):
            browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            time.sleep(1)

        # Source of the page as currently shown by the browser.
        page_text = browser.page_source

        # Save the page to disk.
        with open('./sources/QQzone.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)

        tree = etree.HTML(page_text)

        li_list = tree.xpath('//*[@id="feed_friend_list"]/li')

        for li in li_list:
            text_list = li.xpath('.//div[@class="f-info"]//text() | .//div[@class="f-info qz_info_cut"]//text()')
            text = ''.join(text_list)
            # Three newline escapes separate entries; the published listing had
            # this literal broken across several source lines.
            print(text + '\n\n\n')
    finally:
        # quit() ends the whole driver session, as the other selenium scripts
        # in this file do; close() would only close the current window.
        browser.quit()
    
  • 相关阅读:
    VS 对话框控件的Tab顺序问题
    基于OpenGL三维软件开发
    OpenGL 中的三维纹理操作
    VC 在桌面上绘制一些图形
    VC/MFC如何添加启动界面
    Cordova or Xamarin 用.net开发IOS和Android程序
    ASP.NET Web API
    软件项目如何选型
    CIO的职责、条件及价值
    Oracle日期周详解IW
  • 原文地址:https://www.cnblogs.com/haitaoli/p/10633237.html
Copyright © 2011-2022 走看看