    2018-8-10 Crawler Notes: Day 4

    • Scraping British and American TV series from Douban Movies (homework)
    import json
    
    import requests
    
    
    class GetDouBanMovies(object):
        __instance = None
    
        def __new__(cls, *args, **kwargs):
            # simple singleton: create the instance only once and reuse it afterwards
            if cls.__instance is None:
                cls.__instance = super().__new__(cls)
            return cls.__instance
    
        def __init__(self):
            self.temp_urls = [
                {
                    "url": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1533811595869",
                    "Referer": "https://m.douban.com/tv/british"
                },
                {
                    "url": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1533818802677",
                    "Referer": "https://m.douban.com/tv/american"
                }
            ]
            # self.url = url
            self.headers = {
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko)"
                              " Version/11.0 Mobile/15A372 Safari/604.1",
                "Referer": "https://m.douban.com/tv/british"
            }
    
        def __prase_url(self, url):
            response = requests.get(url, headers=self.headers, timeout=5)
            assert response.status_code == 200
            html_str = response.content.decode()
            ret = json.loads(html_str)
            self.subject_collection_items = ret.get("subject_collection_items")
            return self.subject_collection_items
    
        def prase_url(self, url):
            try:
                movies_lists = self.__prase_url(url)
            except Exception as e:
                movies_lists = None
            return movies_lists
    
        def write_str(self, movies_lists):
            for temp_movie in movies_lists:
                with open("./british2/read.txt", "a", encoding="utf-8") as f:
                    f.write(json.dumps(temp_movie, ensure_ascii=False))
                    f.write("
    
    ")
    
        def run(self):
            for next_url in self.temp_urls:
                start_num = 0
                self.headers["Referer"] = next_url["Referer"]
                while True:
                    url = next_url["url"].format(start_num)
                    print(url)
                    movies_lists = self.prase_url(url)
                    if not movies_lists:  # stop if the request failed or nothing was returned
                        break
                    self.write_str(movies_lists)
                    if len(movies_lists) != 18:  # fewer than a full page of 18 means the last page
                        break
                    start_num += 18
    
    
    if __name__ == "__main__":
        s1 = GetDouBanMovies()
        s1.run()
    
    
    • Regular expressions
    • Three commonly used functions of the re module (a short sketch follows this list)
      • re.findall()  # returns a list of all matches
      • re.sub()  # returns a new string with the matches replaced
      • re.compile()  # pre-compiles the pattern to improve matching efficiency
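
    A minimal sketch of the three functions on a made-up sample string (the pattern and text below are only illustrative):

    import re

    text = "tel: 010-12345678, backup tel: 021-87654321"

    # re.findall() returns every match as a list
    print(re.findall(r"\d{3}-\d{8}", text))     # ['010-12345678', '021-87654321']

    # re.sub() returns a new string with the matches replaced
    print(re.sub(r"\d{3}-\d{8}", "***", text))  # 'tel: ***, backup tel: ***'

    # re.compile() builds the pattern object once so it can be reused efficiently
    pattern = re.compile(r"\d{3}-\d{8}")
    print(pattern.findall(text))                # same result as re.findall() above
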
    • Default mode (without re.S, "." does not match newlines)
    import re

    string_a = '''<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    		<meta http-equiv="content-type" content="text/html;charset=utf-8">
    		<meta content="always" name="referrer">
            <meta name="theme-color" content="#2932e1">'''
    ret = re.findall(r'<.*>', string_a)
    print(ret)
    
    • Execution result
    ['<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">', '<meta http-equiv="content-type" content="text/html;charset=utf-8">', '<meta content="always" name="referrer">', '<meta name="theme-color" content="#2932e1">']
    
    • re.S mode (with the re.S flag, "." also matches newline characters)
    string_a = '''<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    		<meta http-equiv="content-type" content="text/html;charset=utf-8">
    		<meta content="always" name="referrer">
            <meta name="theme-color" content="#2932e1">'''
    ret = re.findall(r'<.*>', string_a, re.S)
    print(ret)
    
    • Execution result
    ['<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
    		<meta http-equiv="content-type" content="text/html;charset=utf-8">
    		<meta content="always" name="referrer">
            <meta name="theme-color" content="#2932e1">']
    
    • Raw strings in Python

      • The r prefix marks a string as a raw string
      • For special (escape) characters, a raw string keeps their literal meaning, i.e. backslashes are not interpreted as escape sequences; see the sketch below
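
    A minimal sketch of the difference (the sample strings below are only illustrative):

    # without r, "\n" is an escape sequence and becomes a single newline character
    s1 = "a\nb"
    print(len(s1))   # 3 -> 'a', newline, 'b'

    # with r, the backslash and the letter n stay as two separate literal characters
    s2 = r"a\nb"
    print(len(s2))   # 4 -> 'a', '\', 'n', 'b'

    # this is why regular expressions are usually written as raw strings, e.g. r"\d+"
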
    • Processing an HTML page with XPath (lxml)

    from lxml import etree
    
    text = """
    <div> <ul>
            <li class="item-1"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item</a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a>
            </ul> </div>
    
    """
    html = etree.HTML(text)
    
    li_lists = html.xpath("//li")  # returns a list of li Element objects, e.g. [<Element li at 0x7fc520c7d088>, ...]
    item = {}
    for li_list in li_lists:
        key = li_list.xpath("./a/@href")[0] if li_list.xpath("./a/@href") else None  # value of the href attribute
        value = li_list.xpath("./a/text()")[0] if li_list.xpath("./a/text()") else None  # text of the <a> tag
        item[key] = value
    print(item)
    handeled_html_str = etree.tostring(html).decode()  # convert the Element object back to an HTML string
    print(handeled_html_str)
    
    • Summary:

      • etree.HTML(text) parses the input string and repairs it into well-formed HTML, completing any missing tags; a small sketch follows this list
      • etree.tostring(html).decode() converts the Element object back into a string
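
    A minimal sketch of the repair behaviour (the HTML fragment below is only illustrative):

    from lxml import etree

    broken = '<div><ul><li><a href="link5.html">fifth item</a></ul></div>'  # the <li> is never closed
    fixed = etree.tostring(etree.HTML(broken)).decode()
    print(fixed)
    # lxml wraps the fragment in <html>/<body> and closes the open <li>, roughly:
    # <html><body><div><ul><li><a href="link5.html">fifth item</a></li></ul></div></body></html>
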
    • Scraping data from Guokr (guokr.com)

    import json
    from pprint import pprint
    
    import re
    import requests
    
    
    class GetNews(object):
        __instance = None
    
        def __new__(cls, *args, **kwargs):
            # simple singleton: create the instance only once and reuse it afterwards
            if cls.__instance is None:
                cls.__instance = super().__new__(cls)
            return cls.__instance
    
        def __init__(self):
            self.headers = {
                "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko)"
                              " Version/11.0 Mobile/15A372 Safari/604.1",
            }
    
        def __prase_url(self, url):
            response = requests.get(url, headers=self.headers, timeout=5)
            assert response.status_code == 200
            html_str = response.content.decode()
            news_lists = re.findall(r"""<h2><a target="_blank" href="(.*?)">(.*?)</a></h2>""", html_str)
            return news_lists
    
        def prase_url(self, url):
            try:
                news_lists = self.__prase_url(url)
            except Exception as e:
                news_lists = None
            return news_lists
    
        def write_str(self, news_lists):
            for temp_news in news_lists:
                with open("./news/read.txt", "a", encoding="utf-8") as f:
                    f.write(json.dumps(temp_news, ensure_ascii=False))
                    f.write("\n\n")
    
        def run(self):
            page = 0
            # while True:
            #     print(page)
            #     url = "https://www.guokr.com/ask/highlight/?page={}".format(page)
            #     print(url)
            #     news_lists = self.prase_url(url)
            #     self.write_str(news_lists)
            #     page += 1
            #     if page >100:
            #         break
    
            for page in range(1,5):
                print(page)
                url = "https://www.guokr.com/ask/highlight/?page={}".format(page)
                print(url)
                news_lists = self.prase_url(url)
                if news_lists:  # skip the page if the request failed or nothing matched
                    self.write_str(news_lists)
    
    
    if __name__ == "__main__":
        n1 = GetNews()
        n1.run()
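
    The Guokr pattern above relies on the non-greedy quantifier ".*?". A minimal sketch of greedy vs. non-greedy matching (the sample HTML is only illustrative):

    import re

    html = '<a href="link1.html">first</a><a href="link2.html">second</a>'

    # greedy: ".*" grabs as much as possible, so both anchors collapse into one match
    print(re.findall(r'href="(.*)"', html))   # ['link1.html">first</a><a href="link2.html']

    # non-greedy: ".*?" stops at the first closing quote
    print(re.findall(r'href="(.*?)"', html))  # ['link1.html', 'link2.html']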
    