zoukankan      html  css  js  c++  java
  • Python 自用代码(知网会议论文网页源代码清洗)

    #coding=utf-8
    from pymongo import MongoClient
    from lxml import etree
    import requests
    
    # Label strings substituted into the "//p[text()='%s']/a/text()" XPath
    # queries in standard_dict to locate the organization and author paragraphs.
    # NOTE(review): both literals span several physical lines, which a plain
    # u"..." string cannot do; presumably they originally held escaped line
    # breaks that were expanded when the code was copied. Confirm the exact
    # whitespace against a real page before relying on these values.
    jigou = u"
          【机构】
          "
    zuozhe = u"
            【作者】
              "
    
    def get_db():
        """Connect to MongoDB on localhost:27017 and return the ``cnki`` database.

        The database handle is authenticated with the hard-coded credentials
        before it is returned.
        """
        connection = MongoClient(host='localhost', port=27017)
        database = connection["cnki"]
        database.authenticate("用户名", "密码")
        return database
    
    def get_data(table, num):
        """Return the ``html`` field of the num-th document in *table*.

        Args:
            table: object whose ``find(filter, projection)`` yields dict-like
                documents (a pymongo collection in this script).
            num: 1-based position of the wanted document in iteration order.

        Returns:
            The document's ``html`` value, or None when there are fewer than
            *num* documents or the num-th one has no non-empty ``html`` field.
        """
        projection = {"html": 1, "_id": 0}
        for position, item in enumerate(table.find({}, projection), 1):
            if position == num:
                # Answer for exactly this document and stop; never fall through
                # to a later document when this one has no usable html.
                return item.get('html') or None
        return None
    
    def list_str(list):
        """Return the first element of *list*, or "" when *list* is empty."""
        return list[0] if len(list) != 0 else ""
    
    def en_ls(list, length1, length2):
        """Split the English author line into author names and organization names.

        Only ``list[0]`` is examined. After the author label and line breaks
        are removed, the text is split on ";" and must yield exactly
        ``length1 + length2 + 1`` pieces: *length1* authors, *length2*
        organizations and one trailing piece, which is discarded.

        Args:
            list: sequence of text strings; may be empty.
            length1: expected number of author names.
            length2: expected number of organization names.

        Returns:
            A 2-tuple ``(authors, organizations)``, each a ";"-joined string.
            Both are "" when *list* is empty or the piece count does not match.
        """
        if len(list) == 0:
            return "", ""
        text = list[0].replace(u"【Author】", "")
        # Remove CR and LF separately so both "\n" and "\r\n" line endings are handled.
        text = text.replace("\r", "").replace("\n", "").strip()
        parts = text.split(";")
        if len(parts) != length1 + length2 + 1:
            return "", ""
        # parts[-1] is the piece after the final ";" and is intentionally dropped.
        return ";".join(parts[:length1]), ";".join(parts[length1:-1])
    
    def hyxx(list):
        """Extract the conference fields from a list of labelled text strings.

        Each item is matched against the labels below in order; the first label
        found in the item decides which field it fills. The label and any line
        breaks are removed and the remainder is stripped. When several items
        carry the same label, the last one wins.

        Args:
            list: sequence of text strings; may be empty.

        Returns:
            A 6-tuple of strings, in label order: proceedings title, conference
            title, conference date, conference place, classification number and
            organizer. A field with no matching item is "".
        """
        # NOTE(review): the organizer separator literal is empty in this copy of
        # the code (the character was presumably lost when the source was
        # copied). Replacing "" would insert ";" between every character, so the
        # substitution is skipped until the real separator is restored here.
        sponsor_sep = u""
        labels = (
            u"【会议录名称】",
            u"【会议名称】",
            u"【会议时间】",
            u"【会议地点】",
            u"【分类号】",
            u"【主办单位】",
        )
        sponsor_label = labels[-1]
        # Every field defaults to "" so the result type does not depend on
        # which labels happened to be present.
        values = [""] * len(labels)
        for item in list:
            for index, label in enumerate(labels):
                if label not in item:
                    continue
                value = item.replace(label, "")
                if label == sponsor_label and sponsor_sep:
                    value = value.replace(sponsor_sep, ";")
                # Remove CR and LF separately so both line-ending styles are handled.
                values[index] = value.replace("\r", "").replace("\n", "").strip()
                break
        return tuple(values)
    
    def list2str(list):
        """Join the strings in *list* with ";"; an empty *list* gives ""."""
        if len(list) == 0:
            return ""
        return ";".join(list)
    
    def standard_dict(html):
        """Build the dictionary stored for one paper from its page HTML.

        The HTML is parsed with ``etree.HTML`` and each field is pulled out
        with an XPath query. Multi-valued fields are ";"-joined; a field whose
        query matches nothing becomes "".

        Args:
            html: HTML source of one paper page.

        Returns:
            A dict with the keys title, title_eng, author, organization,
            author_eng, organization_eng, summary, summary_eng, keywords,
            keywords_eng, proceeding_title, conference_title, conference_date,
            conference_place, huiyflh and conference_org.
        """
        tree = etree.HTML(html)

        # Query authors and organizations once: both the names and their counts
        # are needed below.
        author_names = tree.xpath("//p[text()='%s']/a/text()" % zuozhe)
        org_names = tree.xpath("//p[text()='%s']/a/text()" % jigou)

        dc = {}
        # Chinese and foreign-language titles
        dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
        dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))
        # Authors and organizations
        dc["author"] = list2str(author_names)
        dc["organization"] = list2str(org_names)
        # English author and organization names; the counts tell en_ls where to
        # split the combined line.
        dc["author_eng"], dc["organization_eng"] = en_ls(
            tree.xpath("//p[@id='au_en']/text()"), len(author_names), len(org_names))
        # Abstracts
        dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
        dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))
        # Keywords
        dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
        dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))
        # Conference information
        (dc["proceeding_title"], dc["conference_title"], dc["conference_date"],
         dc["conference_place"], dc["huiyflh"], dc["conference_org"]) = hyxx(
            tree.xpath("//div[@class='summary']/ul/li/text()"))
        if dc["proceeding_title"] == "":
            # Fall back to the link text inside the first summary list when the
            # plain-text items gave no proceedings title.
            dc["proceeding_title"] = list_str(
                tree.xpath("//div[@class='summary']/ul[1]/li/a/text()"))

        return dc
    
    def main():
        """Clean every stored page and insert the result into ``conference_cleaned``.

        Reads the ``html`` field of each document in the ``conference``
        collection, converts it with ``standard_dict`` and inserts the
        resulting dict into ``conference_cleaned``. Documents with a missing or
        empty ``html`` field are skipped.
        """
        db = get_db()
        source = db.conference
        target = db.conference_cleaned
        for item in source.find({}, {"html": 1, "_id": 0}):
            # dict.get replaces the Python-2-only has_key check; a missing key
            # and an empty value are both skipped.
            html = item.get('html')
            if not html:
                continue
            target.insert(standard_dict(html))
    
    
    if __name__ == '__main__':
        main()
        # The commented-out code below cleans one specific record, for testing
        # db = get_db()
        # collection=db.conference
        # data = get_data(collection, 1)
        # dc = standard_dict(data)
        # for k,v in dc.items():
        #     print k,v
  • 相关阅读:
    【SVN解决代码提交冲突】https://www.cnblogs.com/aaronLinux/p/5521844.html
    查询有2门及以上不及格科目的学生姓名及其平均成绩
    【Python】split
    【Python】文件处理
    【robotframework】打开浏览器提示:NoSuchWindowException: Message: Unable to get browser
    定位到新窗口
    8月1号
    【定位】https://blog.csdn.net/cyjs1988/article/details/76284289
    【Robotframework】脚本跑完后自动发送邮件
    jQuery Mobile Data 属性
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/7192845.html
Copyright © 2011-2022 走看看