import requests
from lxml import etree
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
from selenium import webdriver


# Parse one comic page and download its image
def manhua(url):
    # Load the page in the automated browser and grab the rendered source
    # (the image URL is filled in by script, so plain requests cannot see it)
    browser.get(url)
    html = browser.page_source

    html = etree.HTML(html)
    img_url = html.xpath('//img[@id="mangaFile"]/@src')[0]
    alt = html.xpath('/html/body/div[2]/div[2]/h1/a/text()')[0]
    title = html.xpath('/html/body/div[2]/div[2]/h2/text()')[0]
    print(img_url, alt, title)

    # Create ./漫画/<comic>/<chapter>/ if it does not exist yet
    path = './漫画/' + alt + '/' + title + '/'
    if not os.path.exists(path):
        os.makedirs(path)
    fname = img_url.split('/')[-1]
    print(os.path.join(path, fname))

    # Fetch the image itself
    response = requests.get(img_url)
    # Raw binary body of the response
    data = response.content
    # Save it to disk
    with open(path + fname, 'wb') as f:
        f.write(data)

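
# Note: the downloader above reads the whole image into memory before writing
# it out. For large files a streamed download is gentler; a minimal sketch of
# an alternative (not used by the script, and it assumes the server answers a
# plain GET without needing a Referer header):
def download_image(img_url, dest):
    with requests.get(img_url, stream=True) as resp:
        resp.raise_for_status()
        # Write the image in chunks instead of buffering it all at once
        with open(dest, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
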
# Parse a chapter page and build the links of its comic pages
def manhua_url(url):
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = etree.HTML(html)
    # i is the chapter's page count, taken from the span text with its
    # first and last characters stripped
    i = html.xpath('/html/body/div[2]/div[2]/span/text()')[1][1:-1]
    i = int(i)
    # The pagination follows a fixed pattern, so build each link with format()
    url = url + '/index.html?p={}'
    for n in range(1, i + 1):
        # fullurl is the link of one comic page
        fullurl = url.format(n)
        print(fullurl)
        # time.sleep(2)
        manhua(fullurl)

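
# Note: hitting the site page after page with no pause may get the crawler
# blocked; the commented-out time.sleep(2) above hints at throttling. The same
# loop with a fixed delay, as a sketch (assumes a 2-second pause is enough):
#
# import time
# for n in range(1, i + 1):
#     manhua(url.format(n))
#     time.sleep(2)
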
# Parse a comic's chapter-list page
def list_page(lb_url):
    response = requests.get(lb_url)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # Match all chapter links
    url_list = html.select('div.subBookList ul li')
    for url in url_list:
        # Keep only the chapter's own path segment of the href
        url = url.select('a')[0].get('href').split('/')[-2]
        # Join with '/' explicitly; os.path.join is meant for filesystem
        # paths and would use backslashes on Windows
        fullurl = lb_url.rstrip('/') + '/' + url
        print(fullurl)
        # Chapter link: descend into its pages
        manhua_url(fullurl)

# Parse the home page
def shouye():
    # Home page URL
    base_url = 'http://www.omanhua.com/'
    # Send the request
    response = requests.get(base_url)
    # Decode with the detected encoding
    response.encoding = response.apparent_encoding
    # The returned page
    html = response.text
    # Parse it
    html = BeautifulSoup(html, 'lxml')
    # Match the links of the hottest comics
    url_list = html.select('ul#cartoon_image_show1 li')
    for url in url_list:
        # Drop the leading '/' from the href, then join it onto the base URL
        url = url.select('a')[0].get('href')[1:]
        fullurl = urljoin(base_url, url)
        print(fullurl)
        # Descend into this comic's chapter list
        list_page(fullurl)

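
# Note: none of the requests in this script set a timeout or check the HTTP
# status, so a dead link can stall the crawl or silently save an error page.
# A minimal fetch helper, as a sketch only (hypothetical, not wired into the
# functions above):
def fetch(url):
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    return resp.text
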
if __name__ == '__main__':
    # Use the browser-automation module selenium to drive a real browser
    # (Chrome here): the image is loaded by script, so its URL cannot be
    # scraped from the static HTML alone.
    # The path below points at the ChromeDriver executable.
    browser = webdriver.Chrome(executable_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe')
    shouye()
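
# Note: executable_path is deprecated in Selenium 4 and removed in later
# releases; there the driver path goes through a Service object instead.
# A minimal sketch, assuming Selenium 4:
#
# from selenium.webdriver.chrome.service import Service
# browser = webdriver.Chrome(service=Service(r'C:\Users\zhaozhi\Desktop\chromedriver.exe'))
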
I have only just started teaching myself web scraping, so the code may be a bit clumsy. I hope we can all learn and improve together.