Example page: http://xyzp.haitou.cc/article/722427.html
The first step is to download each page. You can use os.system("wget " + str(url)) or urllib2.urlopen(url); both are straightforward, so I won't dwell on them (a quick sketch follows below).
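As a minimal sketch of the download step (the URL list and the output file names here are my own assumptions, not part of the original script):

#!/usr/bin/env python
# coding=utf-8
# Minimal download sketch; URL list and output naming are assumptions.
import os
import urllib2

urls = ["http://xyzp.haitou.cc/article/722427.html"]  # the example page above

for i, url in enumerate(urls):
    # Option 1: shell out to wget (saves under the remote file name)
    os.system("wget " + str(url))

    # Option 2: fetch with urllib2 and write the HTML ourselves
    html = urllib2.urlopen(url).read()
    with open("page_%d.html" % i, "w") as fw:
        fw.write(html)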
Then comes the main event: information extraction.
#!/usr/bin/env python
# coding=utf-8
from bs4 import BeautifulSoup
import codecs
import sys
import os
reload(sys)
sys.setdefaultencoding("utf-8")
import re
from pymongo import MongoClient


def get_jdstr(fname):
    """Parse one saved HTML page and return a dict of extracted fields."""
    retdict = {}
    with open(fname) as fr:
        soup = BeautifulSoup(fr.read().replace('""', '"'))
    jdstr = soup.get_text()
    retdict["inc_name"] = soup.title.string.split()[0]
    retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
    retdict["index_url"] = re.search(r"http://xyzp\.haitou\.cc/article/\d+\.html", jdstr).group()
    retdict["info_from"] = soup.find_all("p", "text-ellipsis")[0].contents[1].get_text()
    retdict["workplace"] = soup.find_all("p", "text-ellipsis")[1].contents[1].get_text()
    retdict["info_tag"] = soup.find_all("p", "text-ellipsis")[2].contents[1].get_text()
    retdict["pub_time"] = soup.find_all("p", "text-ellipsis")[3].contents[1].get_text()
    return retdict


def JD_extr():
    """Extract every downloaded page and dump the fields to a CSV file."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
    fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
    res = []
    for fname in fnames[1:500]:
        tmp = []
        retdict = get_jdstr(fname)
        res.append(retdict)
        for k, v in retdict.iteritems():  # note: dict order is arbitrary in Python 2
            tmp.append(v)
        fw.write(" , ".join(tmp) + "\n")
        fw.write("===" * 20 + "\n")  # separator between records
        print fname, "done!"
    return res


def change2html():
    """Rename downloaded .txt files to .html so they match the glob above."""
    fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
    for fname in fnames:
        cmd = "mv " + str(fname) + " " + fname[:-3] + "html"
        print cmd
        os.system(cmd)


def store2mongodb():
    """Insert the extracted documents into a local MongoDB collection."""
    client = MongoClient("localhost", 27017)
    db = client.JD_Haitou
    documents = JD_extr()
    for d in documents:
        db.haitouJD.insert(d)
    mycol = db["haitouJD"]
    print mycol.count()


def split_jd_test_data(fname='./tmp_jd_haitou_clean.csv'):
    """Pull the article URLs back out of the CSV and write them with a label."""
    fw = codecs.open('./split_jd_res.csv', 'w', 'utf-8')
    fr = codecs.open(fname, 'r', 'utf-8')
    indexurl = re.compile(r"http://xyzp\.haitou\.cc/article/\d+\.html")
    for line in fr:
        if indexurl.search(line):
            url = indexurl.search(line).group()
            cnt = '1'  # defaults to 1
            fw.write(url + "\t" + cnt + "\n")
    fr.close()
    fw.close()


if __name__ == "__main__":
    JD_extr()  # extract, then write to file
    store2mongodb()
    split_jd_test_data()
    print "done"
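To sanity-check what store2mongodb() inserted, you can query the collection afterwards. A small sketch, assuming MongoDB is running locally on the default port as in the script above:

# coding=utf-8
# Quick check of the stored postings; assumes the same local MongoDB
# instance, database, and collection names used by store2mongodb().
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
col = client.JD_Haitou["haitouJD"]

print col.count()                # total number of stored postings
for doc in col.find().limit(3):  # peek at a few documents
    print doc["inc_name"], doc["index_url"]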