zoukankan      html  css  js  c++  java
  • Python 自用代码(某方标准类网页源代码清洗)

    用于mongodb中“标准”数据的清洗,数据为网页源代码,须从中提取:

    标准名称,标准外文名称,标准编号,发布单位,发布日期,状态,实施日期,开本页数,采用关系,中图分类号,中国标准分类号,国际标准分类号,国别,关键词,摘要,替代标准。

    提取后组成字典存入另一集合。

    #coding=utf-8
    from pymongo import MongoClient
    from lxml import etree
    import requests
    
    # Label texts matched against <span> elements by standard_dict; the index
    # of each label is what standard_dict uses to pick it, so order matters.
    s = [
        u'标准编号:',        # 0  -> standard_number
        u'发布单位:',        # 1  -> publishing_department
        u'发布日期:',        # 2  -> release_date
        u'状态:',            # 3  -> state
        u'实施日期:',        # 4  -> enforcement_date
        u'开本页数:',        # 5  -> pages
        u'采用关系:',        # 6  -> adopted
        u'中图分类号:',      # 7  -> clc
        u'中国标准分类号:',  # 8  -> ccs
        u'国际标准分类号:',  # 9  -> ics
        u'国别:',            # 10 -> country
        u'关键词:',          # 11 -> keywords
        u'摘要:',            # 12 -> summary
    ]
    
    # 获取数据库
    def get_db():
        """Open a MongoDB connection and return the ``wanfang`` database.

        The host, user name and password literals below look like
        placeholders -- TODO confirm they are substituted before running.
        ``authenticate`` is called on the database before it is returned.
        """
        database = MongoClient('IP', 27017).wanfang
        database.authenticate("用户名","密码")
        return database
    
    # 获取第num条数据
    def get_data(table, num):
        """Return the ``content`` of the num-th record of ``table`` (1-based).

        table: object exposing ``find(filter, projection)`` that yields
            dict-like records (presumably a pymongo collection -- confirm).
        num: 1-based position of the wanted record.

        If the num-th record has no ``content`` key or an empty value, the
        scan keeps going and the first later record with non-empty content
        is returned instead.  Returns None when nothing qualifies or when
        ``num`` is smaller than 1.
        """
        if num < 1:
            # Positions start at 1, so nothing can ever match.
            return None
        for position, item in enumerate(table.find({}, {"content":1,"_id":0}), 1):
            if position < num:
                continue
            # dict.get works on Python 2 and 3 alike (has_key is Python 2 only)
            # and folds the "key present" and "value non-empty" checks into one.
            content = item.get('content')
            if content:
                return content
        return None
    
    # 列表转字符串
    def list_str(list):
        """Return the first element of ``list``, or "" when it is empty."""
        if len(list) == 0:
            return ""
        return list[0]
    
    # 提取分类号
    def code_ls(list):
        """Split the first fragment into classification codes.

        list: sequence of text fragments (an XPath ``text()`` result in
            standard_dict); only the first fragment is used.

        The fragment is split on whitespace and every token that contains a
        parenthesis is discarded, leaving only the bare codes.
        Returns the list of remaining tokens, or "" when ``list`` is empty
        (the empty-string return is kept for backward compatibility).
        """
        if len(list) == 0:
            return ""
        # The original tested `"" in token` twice, which is always true and
        # removed every token.  Those two literals are restored here as the
        # full-width parentheses -- NOTE(review): assumed from context, confirm
        # against real page data.
        brackets = (u"(", u")", u"\uff08", u"\uff09")
        return [token for token in list[0].split()
                if not any(mark in token for mark in brackets)]
    
    # 构造关键词列表
    def keywords_ls(list):
        """Return ``list`` unchanged when it has elements, otherwise ""."""
        return list if len(list) != 0 else ""
    
    # 替代标准
    def replace_str(replace):
        """Extract the replaced-standard text from a list of text fragments.

        replace: sequence of text fragments (an XPath ``//text()`` result in
            standard_dict).

        Each fragment is stripped and has its newlines removed; the first
        fragment is then returned without its first five characters
        (presumably a fixed label prefix -- confirm against the page markup).
        Returns "" when ``replace`` is empty.
        """
        if not replace:
            return ""
        # The newline literal had been split across two physical lines, which
        # is a syntax error; it is written as an escape sequence here.
        ls = [i.strip().replace("\n", "") for i in replace]
        return ls[0][5:]
    
    # 提取摘要
    def summary_str(list):
        """Return the first fragment as the summary unless it starts with "<".

        list: sequence of text fragments (an XPath ``text()`` result in
            standard_dict).

        Returns "" when ``list`` is empty or when the first fragment begins
        with "<"; otherwise returns the first fragment.
        """
        if len(list) == 0:
            return ""
        first = list[0]
        # startswith avoids the IndexError that first[0] raises on an empty
        # string.
        if first.startswith("<"):
            return ""
        return first
    
    # 调整日期格式
    def date_str(list):
        """Convert a date such as u"2017年6月1日" to u"2017-06-01".

        list: sequence of text fragments; only the first one is used and the
            sequence itself is left unmodified.

        A month or day is zero-padded when exactly one character separates
        its markers (年..月 for the month, 月..日 for the day).  Text that
        contains none of the markers is returned unchanged.
        Returns "" when ``list`` is empty.
        """
        if len(list) == 0:
            return ""
        text = list[0]
        # Marker positions are taken before any padding so both distance
        # checks refer to the original text.
        year = text.find(u'年')
        month = text.find(u'月')
        day = text.find(u'日')
        if month - year == 2:
            # Single-digit month: pad it.
            text = text.replace(u"年", u"年0")
        if day - month == 2:
            # Single-digit day: pad it.
            text = text.replace(u"月", u"月0")
        return text.replace(u"日", u"").replace(u"年", u"-").replace(u"月", u"-")
    
    # 调整采标格式
    def adopted_ls(string, ls):
        """Parse "code,TYP code,TYP ..." text into dicts appended to ``ls``.

        string: text to parse.
        ls: list that receives one {"code": ..., "type": ...} dict per comma
            found; it is modified in place and also returned.

        For every comma, the text before it (stripped) is the code and the
        three characters after it are the type; parsing resumes right after
        those three characters.  Stops at the first remainder with no comma.
        """
        remaining = string
        while True:
            comma = remaining.find(',')
            if comma == -1:
                return ls
            entry = {}
            entry["code"] = remaining[:comma].strip()
            entry["type"] = remaining[comma+1:comma+4]
            ls.append(entry)
            remaining = remaining[comma+4:]
    
    # 构造标准入库字典
    def standard_dict(html):
        """Build the dict for one standard from its page source.

        html: page source, parsed with lxml's HTML parser.

        Most fields are found through the label spans listed in ``s``: the
        span whose text equals the label is located and the text of a
        neighbouring node is taken, then passed through the matching helper
        (list_str, date_str, code_ls, ...).  Returns the populated dict.
        """
        tree = etree.HTML(html)

        # The three XPath shapes used to reach the value next to a label span.
        sibling_text = "//span[text()='%s']/following-sibling::*/text()"
        sibling_child_text = "//span[text()='%s']/following-sibling::*/child::*/text()"
        parent_sibling_text = "//span[text()='%s']/parent::*/following-sibling::*/text()"

        def labelled(template, index):
            # Evaluate `template` for the label stored at s[index].
            return tree.xpath(template % (s[index]))

        dc = {}
        # Title and foreign-language title come from the headings.
        dc["title"] = list_str(tree.xpath("//h1/text()"))
        dc["title_eng"] = list_str(tree.xpath("//h2/text()"))
        # Plain single-value fields and dates.
        dc["standard_number"] = list_str(labelled(sibling_text, 0))
        dc["publishing_department"] = list_str(labelled(sibling_text, 1))
        dc["release_date"] = date_str(labelled(sibling_text, 2))
        dc["state"] = list_str(labelled(sibling_text, 3))
        dc["enforcement_date"] = date_str(labelled(sibling_text, 4))
        dc["pages"] = list_str(labelled(sibling_text, 5))
        # Adoption relations are parsed into a list of code/type dicts.
        dc["adopted"] = adopted_ls(list_str(labelled(sibling_text, 6)), [])
        # Classification codes; ccs is read one level deeper than clc and ics.
        dc["clc"] = code_ls(labelled(sibling_text, 7))
        dc["ccs"] = code_ls(labelled(sibling_child_text, 8))
        dc["ics"] = code_ls(labelled(sibling_text, 9))
        dc["country"] = list_str(labelled(sibling_text, 10))
        dc["keywords"] = keywords_ls(labelled(sibling_child_text, 11))
        # The summary text sits beside the label's parent element.
        dc["summary"] = summary_str(labelled(parent_sibling_text, 12))
        # Replaced standards are read from a dedicated div.
        dc["replace_for"] = replace_str(tree.xpath("//div[@id='replaceStandard']//child::*//text()"))
        return dc
    
    # 主函数
    def main():
        """Clean every record of ``standard`` into ``standard_cleaned``.

        Reads the ``content`` field of each record in the ``standard``
        collection, skips records where it is missing or empty, converts the
        rest with standard_dict and inserts the result into
        ``standard_cleaned``.
        """
        db = get_db()
        source = db.standard
        target = db.standard_cleaned
        for item in source.find({}, {"content":1,"_id":0}):
            # dict.get works on Python 2 and 3 alike (has_key is Python 2 only).
            html = item.get('content')
            if not html:
                continue
            # NOTE: Collection.insert is deprecated since PyMongo 3.0 in favour
            # of insert_one; kept as-is so the script runs on the driver
            # version it was written for.
            target.insert(standard_dict(html))
    
    # Run the full cleaning pass when executed as a script.  The commented-out
    # snippets below are manual checks kept for reference.
    if __name__ == '__main__':
        main()
        
        # The code below tests cleaning one specific record
        # db = get_db()
        # collection=db.standard
        # collection2 = db.standard_cleaned
        # data = get_data(collection, 8)
        # dc = standard_dict(data)
        # collection2.insert(dc)
        # for k,v in dc.items():
        #     print k,v
    
        # # The code below tests summary extraction
        # data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')
        # dc = standard_dict(data.text)
        # for k,v in dc.items():
        #     print k,v
    
        # # The code below tests the date reformatting
        # l1 = [u"2017年6月28日"]
        # l2 = [u"2017年10月27日"]
        # l3 = [u"2017年12月1日"]
        # l4 = [u"2017年7月1日"]
        # print date_str(l1)
        # print date_str(l2)
        # print date_str(l3)
        # print date_str(l4)
  • 相关阅读:
    UVA 254 Towers of Hanoi
    UVA 701 The Archeologists' Dilemma
    UVA 185 Roman Numerals
    UVA 10994 Simple Addition
    UVA 10570 Meeting with Aliens
    UVA 306 Cipher
    UVA 10160 Servicing Stations
    UVA 317 Hexagon
    UVA 10123 No Tipping
    UVA 696 How Many Knights
  • 原文地址:https://www.cnblogs.com/zhangtianyuan/p/7086107.html
Copyright © 2011-2022 走看看