Storing Massive Data
The preferred option for data persistence is a relational database. There are many relational database products, including Oracle, MySQL, SQL Server, and PostgreSQL. If you need to store massive amounts of low-value data, a document database is also a good choice; MongoDB is one of the leading document databases, and interested readers can study it on their own.
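As a rough illustration of that alternative, the following sketch (not part of the original tutorial) shows how a crawled page could be written to MongoDB with pymongo; the connection URI and the spider / pages names are placeholders.

import zlib

import pymongo

# Placeholder connection URI, database ("spider") and collection ("pages") names.
client = pymongo.MongoClient("mongodb://localhost:27017")
collection = client.spider.pages


def save_page(url, html_page):
    """Upsert a crawled page keyed by URL, compressing the HTML to save space."""
    collection.update_one(
        {"url": url},
        {"$set": {"page": zlib.compress(html_page.encode("utf-8"))}},
        upsert=True
    )

Upserting by URL means that re-crawling the same page simply overwrites the previous copy instead of creating duplicates.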
The following code demonstrates how to use MySQL to save the links and pages crawled from Zhihu Explore.
create database zhihu default charset utf8;
create user "zhihu"@"%" identified by "zhihu.618";
grant all privileges on zhihu.* to "zhihu"@"%";
flush privileges;

use zhihu;

create table tb_explore
(
    id integer auto_increment,
    url varchar(1024) not null,
    page longblob not null,
    digest char(48) unique not null,
    idate datetime default now(),
    primary key (id)
);
import hashlib
import pickle
import re
import zlib
from urllib.parse import urljoin

import pymysql
from bs4 import BeautifulSoup
import requests

conn = pymysql.connect(host="localhost",
                       # port=3306,
                       user="zhihu", password="zhihu.618",
                       database="zhihu", charset="utf8",
                       autocommit=True)


def write_to_db(url, page, digest):
    """Insert one crawled page (compressed BLOB) into the tb_explore table."""
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "insert into tb_explore (url, page, digest) values (%s, %s, %s)",
                (url, page, digest)
            )
            conn.commit()
    except pymysql.MySQLError as err:
        print(err)


def main():
    base_url = "https://www.zhihu.com/"
    seed_url = urljoin(base_url, "explore")
    # Identify as the Baidu spider via the User-Agent header.
    headers = {"user-agent": "Baiduspider"}
    try:
        resp = requests.get(seed_url, headers=headers)
        soup = BeautifulSoup(resp.text, "lxml")
        href_regex = re.compile(r"^/question")
        for a_tag in soup.find_all("a", {"href": href_regex}):
            href = a_tag.attrs["href"]
            full_url = urljoin(base_url, href)
            # The SHA1 digest of the URL serves as a unique fingerprint of the page.
            digest = hashlib.sha1(full_url.encode()).hexdigest()
            html_page = requests.get(full_url, headers=headers).text
            # Pickle and zlib-compress the page before storing it as a BLOB.
            zipped_page = zlib.compress(pickle.dumps(html_page))
            write_to_db(full_url, zipped_page, digest)
    finally:
        conn.close()


if __name__ == '__main__':
    main()
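Because each page is pickled and zlib-compressed before it is written, reading it back requires reversing both steps. The sketch below is an assumed companion to the code above, reusing its connection settings; the helper name read_from_db is hypothetical. It restores a stored page from its SHA1 digest.

import pickle
import zlib

import pymysql


def read_from_db(digest):
    """Fetch a cached page by its SHA1 digest and restore the original HTML text."""
    conn = pymysql.connect(host="localhost", user="zhihu", password="zhihu.618",
                           database="zhihu", charset="utf8")
    try:
        with conn.cursor() as cursor:
            cursor.execute("select page from tb_explore where digest = %s", (digest,))
            row = cursor.fetchone()
            if row is None:
                return None
            # Reverse the write path: decompress first, then unpickle.
            return pickle.loads(zlib.decompress(row[0]))
    finally:
        conn.close()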
Data Caching
In the article 《网络数据采集和解析》 (Web Data Collection and Parsing), we already learned how to scrape data from a given page and how to save the scraped results. However, there is one scenario we have not considered: we may need to extract more data from pages that have already been crawled. Re-downloading those pages is not a big problem for a small site, but caching them instead can noticeably improve the application's performance. The example below demonstrates how to use Redis to cache pages from Zhihu Explore.
import hashlib
import pickle
import re
import zlib
from urllib.parse import urljoin

import redis
from bs4 import BeautifulSoup
import requests


def main():
    base_url = "https://www.zhihu.com/"
    seed_url = urljoin(base_url, "explore")
    client = redis.Redis(host="1.2.3.4", port=6379, password="1qaz2wsx")
    headers = {"user-agent": "Baiduspider"}
    resp = requests.get(seed_url, headers=headers)
    soup = BeautifulSoup(resp.text, "lxml")
    href_regex = re.compile(r"^/question")
    for a_tag in soup.find_all("a", {"href": href_regex}):
        href = a_tag.attrs["href"]
        full_url = urljoin(base_url, href)
        # The SHA1 digest of the URL is used as the field key in the Redis hash.
        field_key = hashlib.sha1(full_url.encode()).hexdigest()
        # Download and cache the page only if it is not already in the hash.
        if not client.hexists("spider:zhihu:explore", field_key):
            html_page = requests.get(full_url, headers=headers).text
            zipped_page = zlib.compress(pickle.dumps(html_page))
            client.hset("spider:zhihu:explore", field_key, zipped_page)
    print("Total %d question pages found." % client.hlen("spider:zhihu:explore"))


if __name__ == '__main__':
    main()
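To actually benefit from the cache, a later run only needs to look a page up by the same SHA1 key and undo the compression and pickling. The sketch below is an assumed companion to the code above, reusing its Redis connection parameters and hash key; the question URL in the usage lines is just a placeholder.

import hashlib
import pickle
import zlib

import redis


def get_cached_page(client, url):
    """Return the cached HTML for a URL, or None if it has not been cached yet."""
    field_key = hashlib.sha1(url.encode()).hexdigest()
    zipped_page = client.hget("spider:zhihu:explore", field_key)
    if zipped_page is None:
        return None
    # Reverse the caching steps: decompress, then unpickle back to the HTML string.
    return pickle.loads(zlib.decompress(zipped_page))


client = redis.Redis(host="1.2.3.4", port=6379, password="1qaz2wsx")
cached = get_cached_page(client, "https://www.zhihu.com/question/12345678")
if cached:
    print(cached[:100])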