import urllib.request
import re

# Entry page listing today's Zhihu Daily stories.
url = 'http://daily.zhihu.com/'

# Compile patterns once at module level; the originals were rebuilt on every
# call inside the per-story loop.
_RE_STORY = re.compile(r'<a href="/story/(.*?)"')
_RE_TITLE = re.compile(r'<title>(.*?)</title>')
_RE_CONTENT = re.compile(r'<div class="content">\n<p>(.*?)</div>', re.S)
# Matches any HTML tag — fixes the original cleanup, which stripped only
# <p>, </p> and <strong>, leaving </strong> and <a ...> link tags behind.
_RE_TAG = re.compile(r'<[^>]+>')


def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Sends a browser-like User-Agent (some sites reject the default one)
    and closes the connection deterministically via the context manager.
    """
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


def get_url_num(html):
    """Extract story ids from the index page *html*.

    Returns the list of full story URLs, in page order.
    """
    return ['http://daily.zhihu.com/story/' + num
            for num in _RE_STORY.findall(html)]


def get_content(newurl):
    """Fetch one story page and print its title and cleaned body text."""
    page = get_html(newurl)
    print(_RE_TITLE.findall(page))
    for content in _RE_CONTENT.findall(page):
        # Strip every remaining tag in one pass instead of chaining
        # .replace() calls for a hard-coded subset of tags.
        print(_RE_TAG.sub('', content))


if __name__ == '__main__':
    # Guard the crawl so importing this module performs no network I/O.
    for newurl in get_url_num(get_html(url)):
        get_content(newurl)
PS:
1、爬取的内容里面还残留着没有处理干净的链接标签;虽然别人给了参考源码,但是自己还看不懂!(还要加油!↖(^ω^)↗↖(^ω^)↗)
2、视频里面说要模拟浏览器登录,但是我这里没有模拟也还是可以正常地爬取(我用的是 Python 3)