1 Principle
By sending a GET request to baidu.com together with the query params, the crawler automatically fetches the result data and extracts the information it needs.
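For reference, the core of the idea is just a GET request with the search keyword as a parameter. Below is a minimal standalone sketch using the requests library directly (the real crawler in section 2 wraps this in MyRequest):

# -*- coding: utf-8 -*-
import requests

params = {'wd': u'intitle:"xxxx 博客园"'.encode('utf-8'), 'ie': 'utf-8'}
resp = requests.get('http://www.baidu.com/s', params=params)
print resp.status_code  # 200 means Baidu returned a result page
print len(resp.text)    # the HTML that will be parsed later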
2 Code
# -*- coding: utf-8 -*-
import urlparse
from collections import OrderedDict

from bs4 import BeautifulSoup

from my_request import MyRequest


class CrawlerBaidu(MyRequest):
    host = 'http://www.baidu.com/s'
    cookies = {
        # fill in your own Baidu cookies here (see note 3.2)
    }

    def __init__(self):
        super(CrawlerBaidu, self).__init__()
        self.params = {'wd': u'intitle:"xxxx 博客园"'.encode('utf-8'),
                       'ie': 'utf-8'}
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/30.0.1581.2 Safari/537.36',
                        'Host': 'www.baidu.com',
                        'Connection': 'keep-alive',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                                  'q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate, br',
                        'Accept-Language': 'zh-CN,zh'}
        self.data = OrderedDict()  # {title1: url1, title2: url2, ...}
        self.web_page = []         # ['10', '20', '30', ...] pn offsets

    @MyRequest.validate_html
    def parser_html(self, html):
        """Extract result titles and their real URLs from one result page."""
        u_text = html.text
        soup = BeautifulSoup(u_text, 'lxml')
        title_list = soup.find_all('h3', class_="t")
        for title in title_list:
            title_name = ''.join(title.find('a').stripped_strings).encode('utf-8')
            url_direct = title.find('a')['href']  # Baidu redirect link
            url = self.searchURL(url_direct)
            if url:
                self.data[title_name] = url.encode('utf-8')
            else:
                continue

    def searchURL(self, url_direct):
        """Follow the Baidu redirect and return the real URL if it points to cnblogs."""
        # url_direct looks like:
        # http://www.baidu.com/link?url=WFdYThWbSNQdbd-r0VeyRYampvt-nDAaH-vT0-xxxx
        html = self.request_url(url_direct)
        # u_text = html.text
        # u_url = re.search(r"URL='.+'", u_text).group()
        # url = u_url[5:-1].encode('utf-8')
        real_url = html.url  # final URL after the redirect
        url_structure = urlparse.urlparse(real_url)
        # print real_url
        if url_structure.netloc == 'www.cnblogs.com':
            return real_url
        else:
            return ''

    # default is 5 pages
    @MyRequest.validate_html
    def searchPage(self, html):
        """Store the pn offsets of the pages to crawl (10, 20, ..., 50)."""
        for i in xrange(1, 6):
            self.web_page.append(str(i * 10))

    def save_file(self):
        """Write the collected {title: url} pairs to a text file."""
        with open('cnblogs_url.txt', 'wb') as f:
            for k, v in self.data.items():
                f.write('%s %s\n' % (k, v))

    def run(self, content=None):
        html = self.request_url(self.host, params=self.params, cookies=self.cookies)
        self.searchPage(html)   # store the page offsets
        self.parser_html(html)  # parse the first page
        # request and parse every following page
        for page in self.web_page:
            print 'Page: %s' % page
            self.params['pn'] = page
            html = self.request_url(self.host, params=self.params, cookies=self.cookies)
            self.parser_html(html)
        self.save_file()
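A typical entry point would look like the sketch below, assuming the class above is saved as crawler_baidu.py (the file name is an assumption):

# -*- coding: utf-8 -*-
from crawler_baidu import CrawlerBaidu

if __name__ == '__main__':
    crawler = CrawlerBaidu()
    crawler.run()  # fetches the first page, then the next five, then writes cnblogs_url.txt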
2.2 Crawling approach:
Send a GET request to baidu.com/s?wd=params, fetch the content of the first query along with the page numbers, and store them.
Then iterate over the page numbers, requesting each page and storing its content.
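Pagination is driven by Baidu's pn parameter, an offset that grows in steps of 10 results. A standalone sketch of the paging loop (cookies omitted, requests assumed installed):

# -*- coding: utf-8 -*-
import requests

params = {'wd': u'intitle:"xxxx 博客园"'.encode('utf-8'), 'ie': 'utf-8'}
for i in xrange(1, 6):
    params['pn'] = str(i * 10)  # pn=10 -> page 2, pn=20 -> page 3, ...
    resp = requests.get('http://www.baidu.com/s', params=params)
    # parse resp.text with BeautifulSoup, exactly as parser_html does above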
3 Notes
3.1 Use the intitle:"xxxx" search operator to get exact-match results.
3.2 The cookies must be filled in manually with your own values; otherwise the exact-match query will not work.
3.3 The MyRequest module is my own wrapper around the requests library; it mainly provides the various request helpers, and you can simply write your own (a minimal sketch follows below).
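Since MyRequest is not listed here, the following is only a minimal sketch of what it could look like; the names request_url and validate_html are taken from the crawler above, and everything else is an assumption:

# -*- coding: utf-8 -*-
import functools

import requests


class MyRequest(object):
    """Thin wrapper around requests; only the parts CrawlerBaidu uses are sketched."""

    def __init__(self):
        self.session = requests.Session()

    def request_url(self, url, params=None, cookies=None):
        # Send a GET request and return the Response object, or None on failure.
        try:
            return self.session.get(url, params=params, cookies=cookies,
                                    headers=getattr(self, 'headers', None),
                                    timeout=10)
        except requests.RequestException:
            return None

    @staticmethod
    def validate_html(func):
        # Decorator: skip the wrapped parser if the response is missing or not 200.
        @functools.wraps(func)
        def wrapper(self, html, *args, **kwargs):
            if html is None or html.status_code != 200:
                return None
            return func(self, html, *args, **kwargs)
        return wrapper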