  • Crawler Basics - 2

    Finding the login POST url
      - Look for the url in the action attribute of the form
      - The POST data is a dict whose keys are the name attributes of the input tags and whose values are the real username and password; the POST url is the url from the action attribute

      - Capture the traffic and look for the login url
      - Tick the "Preserve log" checkbox so the url is not lost when the page redirects
      - Find the POST data and work out the parameters (a minimal sketch follows this list)
        - parameters that never change (e.g. when the password is not dynamically encrypted) can be used as-is
        - parameters that do change are either
          - found in the current response, or
          - generated by js
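
    A minimal sketch of such a login POST, assuming a hypothetical page whose form action is /login and whose inputs are named username and password (the url, field names and header are placeholders, not any real site's parameters):

    # coding=utf-8
    # Hypothetical login sketch: the url and field names come from an imagined
    # <form action="/login"> with <input name="username"> / <input name="password">.
    import requests

    login_url = "http://example.com/login"  # the url from the form's action attribute
    post_data = {
        "username": "your_name",  # key = the name attribute of the input tag
        "password": "your_pass",
    }
    headers = {"User-Agent": "Mozilla/5.0"}

    session = requests.session()  # a session keeps the login cookies for later requests
    response = session.post(login_url, data=post_data, headers=headers)
    print(response.status_code)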

    Locating the js you need
      - Select the button that triggers the js event, open its Event Listeners panel and jump to where the js is defined
      - Use Chrome's "search all files" to search for a keyword from the url
      - Add breakpoints to watch what the js actually does, then perform the same operations in python (a small hypothetical example follows)
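
    For instance (purely hypothetical, not any real site's logic), if stepping through the js shows that it only md5-hashes the password before posting it, the same operation in python is a one-liner:

    # coding=utf-8
    # Hypothetical: reproduce a js md5 step in python.
    import hashlib

    password = "my_password"
    hashed = hashlib.md5(password.encode("utf-8")).hexdigest()
    print(hashed)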

    Installing third-party modules
      - pip install retrying
      - Download the source, unpack it, cd into the unpacked directory and run ```python setup.py install```
      - For a `***.whl` file: `pip install ***.whl`

    Notes on using json
      - Strings inside json are always wrapped in double quotes
      - If they are not double-quoted:
        - eval: can convert simple strings into python types
        - replace: swap the single quotes for double quotes
      - Writing several json strings into one file means the file is no longer a single json string and cannot be loaded directly
        - write one json string per line and read the file back line by line (see the sketch after this list)
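
    A small sketch of the one-json-string-per-line approach (the file name and records are made up for illustration):

    # coding=utf-8
    # Sketch: write one json string per line, then read the file back line by line.
    import json

    records = [{"name": "a", "num": 1}, {"name": "b", "num": 2}]  # made-up records

    with open("records.txt", "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False))
            f.write("\n")  # one json string per line

    with open("records.txt", "r", encoding="utf-8") as f:
        for line in f:
            print(json.loads(line))  # each line is a complete json string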

    # coding=utf-8
    import requests
    import json
    import sys
    
    class BaiduFanyi:
        def __init__(self,trans_str):
            self.trans_str = trans_str
            self.lang_detect_url = "http://fanyi.baidu.com/langdetect"
            self.trans_url = "http://fanyi.baidu.com/basetrans"
            self.headers = {"User-Agent":"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
    
        def parse_url(self, url, data):  # send the post request, get the response
            response = requests.post(url,data=data,headers=self.headers)
            return json.loads(response.content.decode())
    
        def get_ret(self, dict_response):  # extract the translation result
            ret = dict_response["trans"][0]["dst"]
            print("result is :",ret)
    
    
        def run(self):  # main logic
            # 1. detect the language type
                # 1.1 prepare the post url and post_data
            lang_detect_data = {"query": self.trans_str}
                # 1.2 send the post request, get the response
            lang = self.parse_url(self.lang_detect_url, lang_detect_data)["lan"]
                # 1.3 extract the language type
            # 2. prepare the post data for the translation request
            trans_data = ({"query": self.trans_str, "from": "zh", "to": "en"} if lang == "zh"
                          else {"query": self.trans_str, "from": "en", "to": "zh"})
            # 3. send the request, get the response
            dict_response = self.parse_url(self.trans_url, trans_data)
            # 4. extract the translation result
            self.get_ret(dict_response)
    
    
    if __name__ == '__main__':
        trans_str= sys.argv[1]
        baidu_fanyi = BaiduFanyi(trans_str)
        baidu_fanyi.run()
    # coding=utf-8
    import json
    import requests
    from parse_url import parse_url
    from pprint import pprint
    
    url = "https://m.douban.com/rexxar/api/v2/subject_collection/movie_showing/items?start=0&count=18&loc_id=108288"
    html_str = parse_url(url)
    
    # json.loads converts a json string into a python object
    ret1 = json.loads(html_str)
    # pprint(ret1)
    # print(type(ret1))
    
    # json.dumps converts a python object into a json string
    with open("douban.json","w",encoding="utf-8") as f:
        f.write(json.dumps(ret1,ensure_ascii=False,indent=4))
        # f.write(str(ret1))
    
    # with open("douban.json","r",encoding="utf-8") as f:
    #     ret2 = f.read()
    #     ret3 = json.loads(ret2)
    #     print(ret3)
    #     print(type(ret3))
    
    
    # json.load reads data straight from a file-like object
    with open("douban.json","r",encoding="utf-8") as f:
        ret4 = json.load(f)
        print(ret4)
        print(type(ret4))
    
    # json.dump writes a python object into a file-like object
    with open("douban1.json","w",encoding="utf-8") as f:
        json.dump(ret1,f,ensure_ascii=False,indent=2)
    # coding=utf-8
    import re
    from parse_url import parse_url
    import json
    
    url = "http://36kr.com/"
    html_str = parse_url(url)
    
    ret = re.findall("<script>var props=(.*?),locationnal=",html_str)[0]
    
    with open("36kr.json","w",encoding="utf-8") as f:
        f.write(ret)
    
    ret = json.loads(ret)
    print(ret)
    import requests,sys,json
    
    class Automatic():
        def __init__(self,translade_word):
            self.translade_word = translade_word
            self.langdetect_headers ={"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36"}
            self.langdetect_parpams = {"query": translade_word}
            self.langdetect_url ="http://fanyi.baidu.com/langdetect"
            self.translated_url = "http://fanyi.baidu.com/basetrans"
    
        def langdetect(self):
            response = requests.post(self.langdetect_url,headers = self.langdetect_headers,data=self.langdetect_parpams)
            return json.loads(response.content.decode())['lan']
    
        def get_data_language(self,language_word):
            #
            # if "zh" ==language_word:
            #     translade_data ={"query":self.translade_word,
            #                         "from":"zh",
            #                         "to":"en"}
            # else:
            #     translade_data = {"query": self.translade_word,
            #                       "from": language_word,
            #                       "to": "zh"}
            return {"query":self.translade_word,"from":"zh","to":"en"} if "zh" ==language_word 
                    else {"query": self.translade_word,"from": language_word,"to": "zh"}
    
        def translade(self,translade_data):
            response = requests.post(self.translated_url,data=translade_data,headers = self.langdetect_headers)
            response_data = json.loads(response.text)
            # print("1111111111",response_data)
            return response_data
    
        def get_ret(self,response_data):
            data = response_data["trans"][0]["dst"]
            print("{}  翻译后的结果:{}".format(self.translade_word, data))
    
    
    
        def run(self):
            language_word = self.langdetect()
            translade_data= self.get_data_language(language_word)
            response_data = self.translade(translade_data)
            self.get_ret(response_data)
    
    if __name__ == '__main__':
        translade_word = sys.argv[1]
        automatic = Automatic(translade_word)
        automatic.run()
    # coding=utf-8
    import requests
    from retrying import retry
    
    headers={"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    
    @retry(stop_max_attempt_number=3)
    def _parse_url(url,method,data,proxies):
        print("*"*20)
        if method=="POST":
            response = requests.post(url,data=data,headers=headers,proxies=proxies)
        else:
            response = requests.get(url,headers=headers,timeout=3,proxies=proxies)
        assert  response.status_code == 200
        return response.content.decode()
    
    
    def parse_url(url,method="GET",data=None,proxies={}):
        try:
            html_str = _parse_url(url,method,data,proxies)
        except:
            html_str = None
    
        return html_str
    
    if __name__ == '__main__':
        url = "www.baidu.com"
        print(parse_url(url))

    Notes on using regular expressions
      - `re.findall("a(.*?)b","str")` returns only what the parentheses capture; the text before and after the group just anchors and filters the match

      - Raw string r: when the string to match contains backslashes, the r prefix stops them from being interpreted as escape sequences

      - By default `.` does not match `\n`

      - `\s` matches whitespace characters, which includes not only the space but also tabs and newlines (a short example follows)
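
    A quick sketch of these points (the sample string is made up):

    # coding=utf-8
    import re

    text = "price:<b>12.5</b>\nname:<b>apple</b>"  # made-up sample

    # only the group in parentheses is returned; <b> and </b> just anchor the match
    print(re.findall(r"<b>(.*?)</b>", text))             # ['12.5', 'apple']

    # . does not match \n by default, so the cross-line match only works with re.S
    print(re.findall(r"price:(.*?)apple", text))         # []
    print(re.findall(r"price:(.*?)apple", text, re.S))   # ['<b>12.5</b>\nname:<b>']

    # \s matches more than just the space character
    print(re.findall(r"\s", "a b\tc\nd"))                # [' ', '\t', '\n']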


    Key points for xpath
      - xpath helper and Chrome's "copy xpath" both extract from the Elements panel, but the crawler receives the raw response for the url, which is often different from Elements
      - Getting text (see the sketch after this list)
        - `a/text()` gets the text directly under a
        - `a//text()` gets the text of all the tags under a
        - `//a[text()='下一页']` selects the a tags whose text is exactly 下一页 ("next page")

      - The `@` symbol
        - `a/@href` gets the href attribute of a
        - `//ul[@id="detail-list"]` selects the ul whose id is detail-list

      - `//`
        - at the very start of an xpath it means "select from anywhere in the current html"
        - `li//a` means any a tag at any depth under the li
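
    A short sketch of the text() points (the html snippet is made up; lxml runs the xpath):

    # coding=utf-8
    from lxml import etree

    html = etree.HTML('<div><a href="/p2">go to <b>page 2</b></a><a href="/p3">下一页</a></div>')

    print(html.xpath("//a/text()"))                  # ['go to ', '下一页']  text directly under a
    print(html.xpath("//a//text()"))                 # ['go to ', 'page 2', '下一页']  all text under a
    print(html.xpath("//a[text()='下一页']/@href"))  # ['/p3']  select by exact text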

    Notes on using lxml
      - lxml can repair broken HTML, but it may repair it incorrectly
        - use etree.tostring to inspect the corrected html and write the xpath against that corrected string (see the sketch after this list)

      - lxml accepts both bytes and str

      - Approach for extracting page data
        - group first: get a list of the elements that wrap each record
        - then iterate over the groups and extract the fields inside each one, so fields from different records cannot get mismatched
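
    A minimal sketch of checking lxml's repair with etree.tostring and of the group-then-extract approach (the broken snippet is made up):

    # coding=utf-8
    from lxml import etree

    broken = "<ul><li><a href='/1'>one<li><a href='/2'>two</ul>"  # unclosed li and a tags
    html = etree.HTML(broken)

    # look at what lxml actually built and write the xpath against this output
    print(etree.tostring(html, pretty_print=True).decode())

    for li in html.xpath("//li"):  # group by li first
        item = {"href": li.xpath("./a/@href"), "title": li.xpath(".//text()")}
        print(item)  # fields of the same record stay together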

    # coding=utf-8
    import requests
    import json
    
    
    class DoubanSpider:
        def __init__(self):
            self.url_temp_list = [
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                    "country": "US"
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                    "country": "UK"
                },
                {
                    "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                    "country": "CN"
                }
            ]
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
    
        def parse_url(self, url):  # send the request, get the response
            print(url)
            response = requests.get(url, headers=self.headers)
            return response.content.decode()
    
        def get_content_list(self, json_str):  # extract the data
            dict_ret = json.loads(json_str)
            content_list = dict_ret["subject_collection_items"]
            total = dict_ret["total"]
            return content_list, total
    
        def save_content_list(self, content_list, country):  # save
            with open("douban.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    content["country"] = country
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write("
    ")  # 写入换行符,进行换行
            print("保存成功")
    
        def run(self):  # main logic
            for url_temp in self.url_temp_list:
                num = 0
                total = 100  # assume there is at least a first page
                while num < total + 18:
                    # 1.start_url
                    url = url_temp["url_temp"].format(num)
                    # 2. send the request, get the response
                    json_str = self.parse_url(url)
                    # 3. extract the data
                    content_list, total = self.get_content_list(json_str)
    
                    # 4. save
                    self.save_content_list(content_list,url_temp["country"])
                    # if len(content_list)<18:
                    #     break
                    # 5. build the url of the next page and loop
                    num += 18
    
    
    if __name__ == '__main__':
        douban_spider = DoubanSpider()
        douban_spider.run()
    # coding=utf-8
    import requests
    import re
    import json
    
    
    class Neihan:
        def __init__(self):
            self.start_url = "http://neihanshequ.com/"
            self.next_url_temp = "http://neihanshequ.com/joke/?is_json=1&app_name=neihanshequ_web&max_time={}"
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
    
        def parse_url(self, url):  # send the request
            print(url)
            response = requests.get(url, headers=self.headers)
            return response.content.decode()
    
        def get_first_page_content_list(self, html_str):  # extract the first page's data
            content_list = re.findall(r'<h1 class="title">.*?<p>(.*?)</p>', html_str, re.S)
            max_time = re.findall("max_time: '(.*?)',", html_str)[0]
            return content_list, max_time
    
        def save_content_list(self, content_list):  # save
            with open("neihan.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write("
    ")
            print("保存成功")
    
        def get_content_list(self, json_str):  # extract the data from the json of page 2 onwards
            dict_ret = json.loads(json_str)
            data = dict_ret["data"]["data"]
            content_list = [i["group"]["content"] for i in data]
            max_time = dict_ret["data"]["max_time"]
            has_more = dict_ret["data"]["has_more"]
            return content_list, max_time, has_more
    
        def run(self):  # main logic
            # 1. start_url
            # 2. send the request, get the response
            html_str = self.parse_url(self.start_url)
            # 3. extract the data
            content_list, max_time = self.get_first_page_content_list(html_str)
            # 4. save
            self.save_content_list(content_list)
            has_more = True  # assume there is a second page
            while has_more:  # neihanshequ uses has_more to say whether there is a next page
                # 5. build the url of the next page
                next_url = self.next_url_temp.format(max_time)
                # 6. send the request, get the response
                json_str = self.parse_url(next_url)
                # 7. extract the data and the new max_time
                content_list, max_time, has_more = self.get_content_list(json_str)
                # 8. save
                self.save_content_list(content_list)
                # 9. repeat steps 5-8
    
    
    if __name__ == '__main__':
        neihan = Neihan()
        neihan.run()
    # coding=utf-8
    from lxml import etree
    
    
    text = ''' <div> <ul> 
            <li class="item-1"><a>first item</a></li> 
            <li class="item-1"><a href="link2.html">second item</a></li> 
            <li class="item-inactive"><a href="link3.html">third item</a></li> 
            <li class="item-1"><a href="link4.html">fourth item</a></li> 
            <li class="item-0"><a href="link5.html">fifth item</a>  
            </ul> </div> '''
    
    html = etree.HTML(text)
    print(html)
    # look at the html string contained in the element object
    # print(etree.tostring(html).decode())
    
    # get the href of the a under the li with class item-1
    ret1 = html.xpath("//li[@class='item-1']/a/@href")
    print(ret1)
    
    # get the text of the a under the li with class item-1
    ret2 = html.xpath("//li[@class='item-1']/a/text()")
    print(ret2)
    
    # each li is one news item; build a dict from the url and the text
    for href in ret1:
        item = {}
        item["href"] = href
        item["title"] = ret2[ret1.index(href)]
        print(item)
    
    print("*"*100)
    # group by li tag, then write further xpath inside each group
    ret3 = html.xpath("//li[@class='item-1']")
    print(ret3)
    for i in ret3:
        item=  {}
        item["title"] = i.xpath("a/text()")[0] if len(i.xpath("./a/text()"))>0 else None
        item["href"] = i.xpath("./a/@href")[0] if len( i.xpath("./a/@href"))>0 else None
        print(item)
    #  _*_coding: utf-8 _*_
    import json
    import requests
    from parse_url import parse_url
    import sys
    from pprint import pprint
    import re
    
    
    class douban:
        def __init__(self, url):
            self.url = url
            self.L_url = []
            self.start = 0
            self.html_str = ""
            self.ret = {}
    
        def get_total(self):
            html_str = parse_url(self.url)
            # json.loads converts a json string into a python object
            ret1 = json.loads(html_str)
            total = ret1["total"]
            return total
    
        def get_url(self, total):
            while self.start < total + 50:
                url = self.url.format(
                    self.start + 1, 50)
                self.L_url.append(url)
                self.start += 50
    
        def get_name(self):
            Wurl = self.url
            reg = r'https://m.douban.com/rexxar/api/v2/subject_collection/(.*?)/'
            name = re.findall(reg, Wurl)
            return name[0] + ".json"
    
        def data(self, name):
            for url in self.L_url:
                self.html_str = parse_url(url)
                ret = json.loads(self.html_str)
                with open(name, "a", encoding="utf-8") as f:
                    f.write(json.dumps(ret, ensure_ascii=False, indent=4))
    
        def run(self):
            total = self.get_total()
            self.get_url(total=total)
            name = self.get_name()
            self.data(name=name)
    
    
    if __name__ == '__main__':
        url_dict = {
            "美国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=android&start=0&count=18&loc_id=108288",
            "英国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?os=android&start=0&count=18&loc_id=108288",
            "韩国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_korean_drama_hot/items?os=android&start=0&count=18&loc_id=108288",
            "中国": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?os=android&start=0&count=18&loc_id=108288"
        }
        Len = len(sys.argv)
        for i in range(Len - 1):
            url_name = sys.argv[i + 1]
            url = url_dict[url_name]
            print(url)
        douban_spider = douban(url)
        douban_spider.run()