1 # coding = utf-8
2
3 __autor__ = 'litao'
4
5 import random, requests
6 import logging
7 import traceback
8 import time
9 import re
10 from lxml import etree
11 from logMaker import Logger
12
13 USER_AGENTS = [
14 "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
15 "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
16 "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
17 "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
18 "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
19 "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
20 "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
21 "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
22 "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
23 "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
24 "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
25 "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
26 "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
27 "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
28 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
29 "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
30 ]
31
32 headers = {
33 "User-Agent": random.choice(USER_AGENTS)
34 }
35
36 log = Logger('spider.log', logging.ERROR, logging.DEBUG)
# Matches the JSON fragment `"subtype":[ ... ],` in the page source and
# captures the bracketed list. The brackets MUST be escaped: in the original
# pattern `[(.*?)]` the unescaped `[` opened a character class, so the regex
# matched a single char from `(.*?)` followed by `,` and never captured anything.
Type = re.compile(r'subtype":\[(.*?)\],')
38
39
def get_html(url, agent=True):
    """
    Fetch the HTML source of *url*, retrying once on failure.

    :param url: the URL to request
    :param agent: True on the first attempt; the single retry is made with
                  agent=False so the function retries at most once
    :return: the page text, or None if both attempts failed
    """
    html = None  # defined even when the request raises (was UnboundLocalError)
    try:
        html = requests.get(url=url, headers=headers, timeout=10).text
        log.debug("url: %s请求成功!" % url)
    except Exception:
        if agent:
            log.error("url: %s访问出错,出错原因如下即将重试" % url)
            # format_exc() returns the traceback text; the original used
            # print_exc(), which prints to stderr and returns None,
            # so log.error() recorded "None".
            log.error(traceback.format_exc())
            # BUG FIX: the retry's return value was previously discarded,
            # which crashed with UnboundLocalError at `return html`.
            html = get_html(url, agent=False)
    # polite crawl delay on every call, success or failure
    time.sleep(random.randint(5, 9))
    return html
58
59
def get_info(url_and_title_list):
    """
    Scrape each detail page and append "url,title,subtype" to result.txt.

    :param url_and_title_list: iterable of (url, title) pairs to fetch
    :return: None
    """
    try:
        for url, title in url_and_title_list:
            html = get_html(url)
            if html is None:
                # get_html returns None after a failed retry; etree.HTML(None)
                # would raise and abort the whole batch via the outer except
                log.error("url: %s下载失败,跳过" % url)
                continue
            selector = etree.HTML(html)
            print(selector.xpath("//a[@class='tag_item']/text()")[2:])
            try:
                # findall()[0] only raises IndexError, so catch exactly that
                subtype = re.findall(Type, html)[0]
            except IndexError:
                log.error("%s电视剧分为为空!" % title)
                subtype = ''
            content = url + ',' + title + ',' + subtype
            print(content)
            # utf-8 matches the __main__ writer; 'gbk' raised
            # UnicodeEncodeError on titles with characters outside GBK.
            # (Also repairs the line-broken '\n' literal in the original.)
            with open('result.txt', 'a', encoding='utf-8') as f:
                f.write(content + '\n')
            log.debug("向文件中写入: %s" % content)
    except Exception:
        log.error("获取信息过程中出错,出错原因如下")
        # format_exc() returns the traceback text; print_exc() returned None
        log.error(traceback.format_exc())
84
85
def get_all_url():
    """
    Walk every listing page of the TV category (in random order) and hand
    each page's (url, title) pairs to get_info().

    :return: None
    """
    # all 117 listing pages, shuffled so the crawl order varies per run
    page_number_list = random.sample(range(117), 117)
    for page_number in page_number_list:
        log.debug("开始爬去电视剧下第%d页内容" % page_number)
        # each listing page shows 30 items, hence offset = page * 30
        url = 'http://v.qq.com/x/list/tv?sort=18&iarea=-1&offset={offset}'.format(offset=page_number * 30)
        html = get_html(url)
        if html is not None:  # idiomatic identity test (was: html != None)
            selector = etree.HTML(html)  # fixed 'seletor' typo
            child_url_list = selector.xpath("//ul[@class='figures_list']/li/a/@href")
            name = selector.xpath("//strong[@class='figure_title']/a/@title")
            get_info(zip(child_url_list, name))
101
102
if __name__ == "__main__":
    # Crawl every listing page, then append one hard-coded row that the
    # listing pages themselves do not expose.
    get_all_url()
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.writelines('https://v.qq.com/x/cover/5tjct4561pq7zan.html,热剧精彩周边,"创意剪辑"')
1 # coding = utf-8
2
3 __autor__ = 'litao'
4
5 import logging
6
7
class Logger:
    """Convenience wrapper around :mod:`logging` that sends records to both
    the console and a file, each with its own level threshold.
    """

    def __init__(self, path, clevel=logging.DEBUG, Flevel=logging.DEBUG):
        """
        :param path: log file path; also used as the underlying logger's name
        :param clevel: minimum level for the console (StreamHandler)
        :param Flevel: minimum level for the file (FileHandler)
        """
        self.logger = logging.getLogger(path)
        self.logger.setLevel(logging.DEBUG)
        # getLogger() returns the same object for the same name, so wrapping
        # the same path twice used to stack duplicate handlers and emit every
        # record multiple times — only attach handlers the first time.
        if not self.logger.handlers:
            fmt = logging.Formatter('[%(asctime)s] [%(levelname)s] %(message)s', '%Y-%m-%d %H:%M:%S')
            # console handler
            sh = logging.StreamHandler()
            sh.setFormatter(fmt)
            sh.setLevel(clevel)
            # file handler
            fh = logging.FileHandler(path)
            fh.setFormatter(fmt)
            fh.setLevel(Flevel)
            self.logger.addHandler(sh)
            self.logger.addHandler(fh)

    def debug(self, message):
        self.logger.debug(message)

    def info(self, message):
        self.logger.info(message)

    def war(self, message):
        # Logger.warn() was deprecated and is removed in Python 3.13;
        # warning() is the supported spelling.
        self.logger.warning(message)

    def error(self, message):
        self.logger.error(message)

    def cri(self, message):
        self.logger.critical(message)
38
39
if __name__ == '__main__':
    # Manual smoke test: the console shows only ERROR and above,
    # while yyx.log records everything from DEBUG up.
    demo = Logger('yyx.log', logging.ERROR, logging.DEBUG)
    demo.debug('一个debug信息')
    demo.info('一个info信息')
    demo.war('一个warning信息')
    demo.error('一个error信息')
    demo.cri('一个致命critical信息')
# coding = utf-8
__autor__ = 'litao'
import random, requests
import logging
import traceback
import time
import re
from lxml import etree
from logMaker import Logger
USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]
headers = {
"User-Agent": random.choice(USER_AGENTS)
}
log = Logger('spider.log', logging.ERROR, logging.DEBUG)
# Matches the JSON fragment `"subtype":[ ... ],` in the page source and
# captures the bracketed list. The brackets MUST be escaped: in the original
# pattern `[(.*?)]` the unescaped `[` opened a character class, so the regex
# matched a single char from `(.*?)` followed by `,` and never captured anything.
Type = re.compile(r'subtype":\[(.*?)\],')
def get_html(url, agent=True):
    """
    Fetch the HTML source of *url*, retrying once on failure.

    :param url: the URL to request
    :param agent: True on the first attempt; the single retry is made with
                  agent=False so the function retries at most once
    :return: the page text, or None if both attempts failed
    """
    html = None  # defined even when the request raises (was UnboundLocalError)
    try:
        html = requests.get(url=url, headers=headers, timeout=10).text
        log.debug("url: %s请求成功!" % url)
    except Exception:
        if agent:
            log.error("url: %s访问出错,出错原因如下即将重试" % url)
            # format_exc() returns the traceback text; the original used
            # print_exc(), which prints to stderr and returns None,
            # so log.error() recorded "None".
            log.error(traceback.format_exc())
            # BUG FIX: the retry's return value was previously discarded,
            # which crashed with UnboundLocalError at `return html`.
            html = get_html(url, agent=False)
    # polite crawl delay on every call, success or failure
    time.sleep(random.randint(5, 9))
    return html
def get_info(url_and_title_list):
    """
    Scrape each detail page and append "url,title,subtype" to result.txt.

    :param url_and_title_list: iterable of (url, title) pairs to fetch
    :return: None
    """
    try:
        for url, title in url_and_title_list:
            html = get_html(url)
            if html is None:
                # get_html returns None after a failed retry; etree.HTML(None)
                # would raise and abort the whole batch via the outer except
                log.error("url: %s下载失败,跳过" % url)
                continue
            selector = etree.HTML(html)
            print(selector.xpath("//a[@class='tag_item']/text()")[2:])
            try:
                # findall()[0] only raises IndexError, so catch exactly that
                subtype = re.findall(Type, html)[0]
            except IndexError:
                log.error("%s电视剧分为为空!" % title)
                subtype = ''
            content = url + ',' + title + ',' + subtype
            print(content)
            # utf-8 matches the __main__ writer; 'gbk' raised
            # UnicodeEncodeError on titles with characters outside GBK.
            # (Also repairs the line-broken '\n' literal in the original.)
            with open('result.txt', 'a', encoding='utf-8') as f:
                f.write(content + '\n')
            log.debug("向文件中写入: %s" % content)
    except Exception:
        log.error("获取信息过程中出错,出错原因如下")
        # format_exc() returns the traceback text; print_exc() returned None
        log.error(traceback.format_exc())
def get_all_url():
    """
    Walk every listing page of the TV category (in random order) and hand
    each page's (url, title) pairs to get_info().

    :return: None
    """
    # all 117 listing pages, shuffled so the crawl order varies per run
    page_number_list = random.sample(range(117), 117)
    for page_number in page_number_list:
        log.debug("开始爬去电视剧下第%d页内容" % page_number)
        # each listing page shows 30 items, hence offset = page * 30
        url = 'http://v.qq.com/x/list/tv?sort=18&iarea=-1&offset={offset}'.format(offset=page_number * 30)
        html = get_html(url)
        if html is not None:  # idiomatic identity test (was: html != None)
            selector = etree.HTML(html)  # fixed 'seletor' typo
            child_url_list = selector.xpath("//ul[@class='figures_list']/li/a/@href")
            name = selector.xpath("//strong[@class='figure_title']/a/@title")
            get_info(zip(child_url_list, name))
if __name__ == "__main__":
    # Entry point: crawl every listing page, then append one hard-coded row
    # that the listing pages themselves do not expose.
    get_all_url()
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.writelines('https://v.qq.com/x/cover/5tjct4561pq7zan.html,热剧精彩周边,"创意剪辑"')