zoukankan      html  css  js  c++  java
  • 使用bs4对海投网内容信息进行提取并存入mongodb数据库

     example:    http://xyzp.haitou.cc/article/722427.html

    首先是直接下载好每个页面,可以使用 os.system( "wget "+str(url))  或者urllib2.urlopen(url) ,很简单不赘述。

    然后,重头戏,进行信息抽取:

    #!/usr/bin/env python
    # coding=utf-8
    
    from bs4 import BeautifulSoup
    import codecs
    import sys
    import os
    reload(sys)
    sys.setdefaultencoding("utf-8")
    import re
    
    from pymongo import MongoClient
    
    def get_jdstr(fname):
        """Parse one saved job-posting HTML page and return its fields as a dict.

        Keys: inc_name, page_content, index_url, info_from, workplace,
        info_tag, pub_time.  Raises AttributeError/IndexError if the page
        does not have the expected haitou.cc layout.
        """
        retdict = {}
        with open(fname) as fr:
            # Collapse doubled quotes that break attribute parsing in the saved pages.
            soup = BeautifulSoup(fr.read().replace('""', '"'))

        jdstr = soup.get_text()

        # First whitespace-separated token of <title> is taken as the company name.
        retdict["inc_name"] = soup.title.string.split()[0]
        retdict["page_content"] = soup.find_all("div", "panel-body panel-body-text")[0].get_text()
        # BUG FIX: original pattern used "d+" (a literal letter d); "\d+" matches the
        # numeric article id, and the final dot is escaped.
        retdict["index_url"] = re.search(r"http://xyzp.haitou.cc/article/\d+\.html", jdstr).group()

        # The four "text-ellipsis" <p> tags hold: source, workplace, tags, publish time.
        # Hoisted out of the four separate find_all() calls the original made.
        ellipsis = soup.find_all("p", "text-ellipsis")
        retdict["info_from"] = ellipsis[0].contents[1].get_text()
        retdict["workplace"] = ellipsis[1].contents[1].get_text()
        retdict["info_tag"] = ellipsis[2].contents[1].get_text()
        retdict["pub_time"] = ellipsis[3].contents[1].get_text()

        return retdict
    
    
    
    def JD_extr():
        """Extract fields from every *.html file in the cwd, dump them to
        tmp_jd_haitou_clean.csv (comma-joined values, "===" separator line),
        and return the list of per-page dicts.
        """
        fnames = [fname for fname in os.listdir("./") if fname.endswith(".html")]
        res = []
        fw = codecs.open("tmp_jd_haitou_clean.csv", "w", "utf-8")
        try:
            # NOTE(review): slice [1:500] skips the first file and caps at 499 pages;
            # kept as-is to preserve the original sampling behavior — confirm intent.
            for fname in fnames[1:500]:
                retdict = get_jdstr(fname)
                res.append(retdict)
                # BUG FIX: the "\n" literals were broken across physical lines in the
                # original listing; restored as proper escape sequences.
                fw.write(" , ".join(retdict.values()) + "\n")
                fw.write("===" * 20 + "\n")
            print(fname + " done!")
        finally:
            # BUG FIX: the original never closed the output file.
            fw.close()
        return res
    
    
    
    def change2html():
        """Rename every *.txt file in the current directory to *.html."""
        fnames = [fname for fname in os.listdir("./") if fname.endswith(".txt")]
        for fname in fnames:
            # fname[:-3] drops the "txt" suffix (keeps the dot), then appends "html".
            newname = fname[:-3] + "html"
            print(fname + " -> " + newname)
            # BUG FIX: os.rename replaces os.system("mv " + fname + ...), which broke
            # on filenames containing spaces or shell metacharacters.
            os.rename(fname, newname)
    
    
    def store2mongodb():
        """Run JD_extr() and insert every extracted dict into MongoDB
        (database JD_Haitou, collection haitouJD) on localhost:27017,
        then print the collection's document count.
        """
        client = MongoClient("localhost", 27017)
        db = client.JD_Haitou

        documents = JD_extr()
        for d in documents:
            # insert() was removed in pymongo 3.x; insert_one is the supported API.
            db.haitouJD.insert_one(d)

        mycol = db["haitouJD"]
        # Collection.count() is deprecated; count_documents({}) is the replacement.
        print(mycol.count_documents({}))
    
    
    
    def split_jd_test_data(fname='./tmp_jd_haitou_clean.csv'):
        """Pull every posting URL out of the cleaned CSV and write one
        "url<TAB>1" line per match to ./split_jd_res.csv.
        """
        # BUG FIX: "\d+" instead of "d+" so the article id (digits) actually matches;
        # the scrape also mangled the tab/newline literals below into raw whitespace.
        indexurl = re.compile(r"http://xyzp.haitou.cc/article/\d+\.html")
        with codecs.open(fname, 'r', 'utf-8') as fr, \
             codecs.open('./split_jd_res.csv', 'w', 'utf-8') as fw:
            for line in fr:
                match = indexurl.search(line)
                if match:
                    cnt = '1'  # label defaults to 1
                    fw.write(match.group() + "\t" + cnt + "\n")
    
    
    
    
    if __name__ == "__main__":
        # BUG FIX: the first call was under-indented in the original, which would
        # raise an IndentationError before anything ran.
        JD_extr()            # extract postings and write the CSV
        store2mongodb()      # persist the extracted dicts
        split_jd_test_data() # emit "url<TAB>1" test rows
        print("done")
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    Longest Palindromic Substring
    PayPal MLSE job description
    Continuous Median
    Remove Duplicates From Linked List
    Valid IP Address
    Longest substring without duplication
    Largest range
    Subarray sort
    Multi String Search
    Suffix Trie Construction
  • 原文地址:https://www.cnblogs.com/jkmiao/p/4846799.html
Copyright © 2011-2022 走看看