  • Record of code still to be improved

    # -*- coding: utf-8 -*-
    # Scraper for Tencent Video's TV-series listing pages; the Logger helper it
    # imports lives in logMaker.py (second listing below).

    __author__ = 'litao'

    import logging
    import random
    import re
    import time
    import traceback

    import requests
    from lxml import etree

    from logMaker import Logger

    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]

    # The User-Agent is picked once, at import time; every request reuses it.
    headers = {
        "User-Agent": random.choice(USER_AGENTS)
    }

    log = Logger('spider.log', logging.ERROR, logging.DEBUG)
    # Captures the value between the brackets of `"subtype":[...],` in the page source.
    Type = re.compile(r'subtype":\[(.*?)\],')


    def get_html(url, agent=True):
        """
        Fetch the page source for the given url.
        :param url: url to request
        :param agent: allow one retry when True
        :return: html text, or None if the request failed
        """
        html = None
        try:
            html = requests.get(url=url, headers=headers, timeout=10).text
            log.debug("url: %s fetched successfully" % url)
        except Exception:
            if agent:
                log.error("url: %s request failed, reason follows; about to retry" % url)
                log.error(traceback.format_exc())
                html = get_html(url, agent=False)
        time.sleep(random.randint(5, 9))  # throttle; alternatives: random.randint(5, 15), random.randint(16, 35)
        return html


    def get_info(url_and_title_list):
        """
        Extract the fields we need from each detail page.
        :param url_and_title_list: (url, title) pairs to scrape in this batch
        :return: None
        """
        try:
            for url, title in url_and_title_list:
                html = get_html(url)
                if html is None:
                    continue
                selector = etree.HTML(html)
                print(selector.xpath("//a[@class='tag_item']/text()")[2:])
                try:
                    show_type = re.findall(Type, html)[0]
                except Exception:
                    log.error("%s: TV-series subtype is empty" % title)
                    show_type = ''
                content = url + ',' + title + ',' + show_type
                print(content)
                with open('result.txt', 'a', encoding='utf-8') as f:
                    f.write(content + '\n')
                log.debug("written to file: %s" % content)
        except Exception:
            log.error("error while extracting info, reason follows")
            log.error(traceback.format_exc())


    def get_all_url():
        """
        Collect the detail-page urls from every TV-series listing page.
        :return: None
        """
        page_number_list = random.sample(range(117), 117)  # all 117 listing pages, in random order
        for page_number in page_number_list:
            log.debug("start crawling page %d of the TV-series listing" % page_number)
            url = 'http://v.qq.com/x/list/tv?sort=18&iarea=-1&offset={offset}'.format(offset=page_number * 30)
            html = get_html(url)
            if html is not None:
                selector = etree.HTML(html)
                child_url_list = selector.xpath("//ul[@class='figures_list']/li/a/@href")
                name = selector.xpath("//strong[@class='figure_title']/a/@title")
                get_info(zip(child_url_list, name))


    if __name__ == "__main__":
        get_all_url()
        with open('result.txt', 'a', encoding='utf-8') as f:
            f.write('https://v.qq.com/x/cover/5tjct4561pq7zan.html,热剧精彩周边,"创意剪辑"\n')
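
One of the gaps the title alludes to: the User-Agent is chosen only once, at import time, so every request goes out with the same header. Below is a minimal sketch of per-request rotation, reusing the USER_AGENTS list above; the fetch name and its structure are illustrative, not part of the original code.

    import random
    import requests

    def fetch(url, user_agents, timeout=10):
        # Pick a fresh User-Agent for every request instead of once at import time.
        headers = {"User-Agent": random.choice(user_agents)}
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            return None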
    # -*- coding: utf-8 -*-
    # logMaker.py -- the Logger helper imported by the scraper above.

    __author__ = 'litao'

    import logging


    class Logger:
        def __init__(self, path, clevel=logging.DEBUG, Flevel=logging.DEBUG):
            self.logger = logging.getLogger(path)
            self.logger.setLevel(logging.DEBUG)
            fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')
            # console handler
            sh = logging.StreamHandler()
            sh.setFormatter(fmt)
            sh.setLevel(clevel)
            # file handler
            fh = logging.FileHandler(path)
            fh.setFormatter(fmt)
            fh.setLevel(Flevel)
            self.logger.addHandler(sh)
            self.logger.addHandler(fh)

        def debug(self, message):
            self.logger.debug(message)

        def info(self, message):
            self.logger.info(message)

        def war(self, message):
            self.logger.warning(message)

        def error(self, message):
            self.logger.error(message)

        def cri(self, message):
            self.logger.critical(message)


    if __name__ == '__main__':
        logyyx = Logger('yyx.log', logging.ERROR, logging.DEBUG)
        logyyx.debug('a debug message')
        logyyx.info('an info message')
        logyyx.war('a warning message')
        logyyx.error('an error message')
        logyyx.cri('a critical message')
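
Because logging.getLogger(path) returns the same underlying logger object for a given name, constructing this Logger class a second time with the same path would attach a second pair of handlers and double every log line. A hedged sketch of one way to guard against that; the get_logger function and the handlers check are additions, not part of the original.

    import logging

    def get_logger(path, console_level=logging.ERROR, file_level=logging.DEBUG):
        """Return a logger for `path` without stacking duplicate handlers."""
        logger = logging.getLogger(path)
        logger.setLevel(logging.DEBUG)
        if not logger.handlers:  # attach handlers only on the first call
            fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')
            sh = logging.StreamHandler()
            sh.setLevel(console_level)
            sh.setFormatter(fmt)
            fh = logging.FileHandler(path)
            fh.setLevel(file_level)
            fh.setFormatter(fmt)
            logger.addHandler(sh)
            logger.addHandler(fh)
        return logger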