zoukankan html css js c++ java

Python 自用代码（某方标准类网页源代码清洗）

用于mongodb中“标准”数据的清洗，数据为网页源代码，须从中提取：

标准名称,标准外文名称,标准编号,发布单位,发布日期,状态,实施日期,开本页数,采用关系,中图分类号,中国标准分类号,国际标准分类号,国别,关键词,摘要,替代标准。

提取后组成字典存入另一集合。

#coding=utf-8
from pymongo import MongoClient
from lxml import etree
import requests

# Field labels as they appear in the page's <span> elements.
# standard_dict() picks them by position, so the order must not change.
s = [
    u'标准编号：',
    u'发布单位：',
    u'发布日期：',
    u'状态：',
    u'实施日期：',
    u'开本页数：',
    u'采用关系：',
    u'中图分类号：',
    u'中国标准分类号：',
    u'国际标准分类号：',
    u'国别：',
    u'关键词：',
    u'摘要：',
]

# Open the database handle
def get_db():
    """Connect to MongoDB and return the authenticated ``wanfang`` database.

    The host and the credentials below are placeholders to be filled in.
    """
    database = MongoClient('IP', 27017).wanfang
    database.authenticate("用户名", "密码")
    return database

# Fetch the content of the num-th document
def get_data(table, num):
    """Return the ``content`` field of the ``num``-th document in ``table``.

    Args:
        table: object exposing ``find(filter, projection)`` that yields
            dict-like documents (a pymongo collection in this script).
        num: 1-based position of the wanted document in iteration order.

    Returns:
        The document's ``content`` value, or ``None`` when there is no
        ``num``-th document or its content is missing or empty.
    """
    cursor = table.find({}, {"content": 1, "_id": 0})
    for position, item in enumerate(cursor, 1):
        if position == num:
            # Stop at the requested document even when it has no content.
            # The earlier version kept scanning with a stalled counter and
            # returned the content of some later document instead.
            # dict.get replaces has_key(), which does not exist in Python 3.
            return item.get('content') or None
    return None

# First element of a list, or an empty string
def list_str(list):
    """Return ``list[0]``, or ``""`` when ``list`` is empty."""
    if len(list) == 0:
        return ""
    return list[0]

# Extract classification codes
def code_ls(list):
    """Split the first string into codes, dropping bracketed tokens.

    ``list[0]`` is split on whitespace; every token that contains an ASCII
    or full-width parenthesis is discarded and the rest are returned in
    their original order.

    Args:
        list: sequence of strings; only the first element is used.

    Returns:
        A list of the remaining tokens, or ``""`` when ``list`` is empty.
    """
    if len(list) == 0:
        return ""
    # Unicode literals: on Python 2 a byte-string full-width bracket cannot be
    # compared against a unicode token without raising UnicodeDecodeError.
    brackets = (u"(", u")", u"（", u"）")
    return [
        token
        for token in list[0].split()
        if not any(mark in token for mark in brackets)
    ]

# Build the keyword list
def keywords_ls(list):
    """Return ``list`` unchanged when it has elements, otherwise ``""``."""
    return list if len(list) != 0 else ""

# Superseded-standard text
def replace_str(replace):
    """Return the first text fragment with its leading label removed.

    The first element of ``replace`` is stripped of surrounding whitespace,
    has embedded line breaks removed, and loses its first five characters.

    Args:
        replace: sequence of text fragments (an XPath ``text()`` result in
            this script); may be empty.

    Returns:
        The cleaned remainder of the first fragment, or ``""`` when
        ``replace`` is empty.
    """
    # An emptiness test covers both [] and "". The former `replace != ""`
    # comparison was always true for a list.
    if not replace:
        return ""
    # NOTE(review): presumably the five skipped characters are a label prefix
    # in front of the actual value -- confirm against real pages.
    label_length = 5
    # The line-break literal had been turned into a raw newline inside the
    # quotes (a syntax error); it is written as an escape here.
    first = replace[0].strip().replace("\r", "").replace("\n", "")
    return first[label_length:]

# Extract the summary
def summary_str(list):
    """Return the first string unless it starts with ``<``.

    Args:
        list: sequence of strings; only the first element is inspected.

    Returns:
        ``list[0]`` when it does not begin with ``"<"``; ``""`` when it does
        or when ``list`` is empty.
    """
    if len(list) == 0:
        return ""
    first = list[0]
    # startswith() is safe on an empty string, where indexing [0] would raise
    # IndexError.
    if first.startswith("<"):
        return ""
    return first

# Normalise the date format
def date_str(list):
    """Convert a date such as ``2017年6月1日`` to ``2017-06-01``.

    A single-digit month or day is zero-padded, then the year and month
    markers become ``-`` and the day marker is removed. Padding is decided
    from the distance between the markers in the original text.

    Args:
        list: sequence of strings; only the first element is read. The
            sequence itself is left untouched.

    Returns:
        The reformatted date string, or ``""`` when ``list`` is empty.
    """
    if len(list) == 0:
        return ""
    # Work on a local string so the caller's list is not modified.
    text = list[0]
    year = text.find(u'年')
    month = text.find(u'月')
    day = text.find(u'日')
    # A gap of 2 means exactly one character sits between the two markers.
    if month - year == 2:
        text = text.replace(u"年", u"年0")
    if day - month == 2:
        text = text.replace(u"月", u"月0")
    return text.replace(u"日", u"").replace(u"月", u"-").replace(u"年", u"-")

# Parse the adoption relations
def adopted_ls(string, ls):
    """Append one ``{"code", "type"}`` dict to ``ls`` per comma in ``string``.

    For each comma, the text before it (stripped) becomes ``code`` and the
    three characters after it become ``type``; scanning resumes right after
    those three characters. Text after the last entry that contains no comma
    is ignored.

    Args:
        string: raw adoption-relation text.
        ls: list that receives the parsed entries; it is extended in place.

    Returns:
        The same ``ls`` object.
    """
    remaining = string
    while True:
        comma = remaining.find(',')
        if comma == -1:
            return ls
        entry = {
            "code": remaining[:comma].strip(),
            "type": remaining[comma + 1:comma + 4],
        }
        ls.append(entry)
        remaining = remaining[comma + 4:]

# Build the dictionary that is stored for one standard
def standard_dict(html):
    """Parse one page's HTML and return its cleaned fields as a dict.

    Most fields are located through the ``<span>`` whose text equals one of
    the labels in ``s``; the value is read from text nodes positioned
    relative to that span and then normalised by the helper functions.

    Args:
        html: page source handed to ``etree.HTML``.

    Returns:
        A dict with the keys ``title``, ``title_eng``, ``standard_number``,
        ``publishing_department``, ``release_date``, ``state``,
        ``enforcement_date``, ``pages``, ``adopted``, ``clc``, ``ccs``,
        ``ics``, ``country``, ``keywords``, ``summary`` and ``replace_for``.
    """
    tree = etree.HTML(html)

    sibling_text = "/following-sibling::*/text()"
    sibling_child_text = "/following-sibling::*/child::*/text()"
    parent_sibling_text = "/parent::*/following-sibling::*/text()"

    def labelled(index, tail=sibling_text):
        # Text nodes found relative to the span whose text is label s[index].
        return tree.xpath(("//span[text()='%s']" + tail) % (s[index]))

    return {
        # standard name
        "title": list_str(tree.xpath("//h1/text()")),
        # foreign-language name
        "title_eng": list_str(tree.xpath("//h2/text()")),
        # standard number
        "standard_number": list_str(labelled(0)),
        # publishing body
        "publishing_department": list_str(labelled(1)),
        # release date
        "release_date": date_str(labelled(2)),
        # status
        "state": list_str(labelled(3)),
        # enforcement date
        "enforcement_date": date_str(labelled(4)),
        # format / page count
        "pages": list_str(labelled(5)),
        # adoption relations
        "adopted": adopted_ls(list_str(labelled(6)), []),
        # Chinese Library Classification number
        "clc": code_ls(labelled(7)),
        # Chinese standard classification number
        "ccs": code_ls(labelled(8, sibling_child_text)),
        # international standard classification number
        "ics": code_ls(labelled(9)),
        # country
        "country": list_str(labelled(10)),
        # keywords
        "keywords": keywords_ls(labelled(11, sibling_child_text)),
        # summary
        "summary": summary_str(labelled(12, parent_sibling_text)),
        # superseded standard
        "replace_for": replace_str(
            tree.xpath("//div[@id='replaceStandard']//child::*//text()")
        ),
    }

# Entry point
def main():
    """Clean every document of ``standard`` into ``standard_cleaned``.

    Each document's ``content`` (page source) is parsed with
    ``standard_dict`` and the resulting dict is inserted into the second
    collection. Documents with missing or empty content are skipped.
    """
    db = get_db()
    source = db.standard
    cleaned = db.standard_cleaned
    for item in source.find({}, {"content": 1, "_id": 0}):
        # dict.get replaces has_key(), which does not exist in Python 3.
        content = item.get('content')
        if content:
            # NOTE(review): Collection.insert is deprecated in newer PyMongo
            # releases in favour of insert_one -- confirm the installed
            # version before switching.
            cleaned.insert(standard_dict(content))

# Run the full cleaning job when executed as a script.
if __name__ == '__main__':
    main()
    
    # The snippet below tests cleaning one specific record
    # db = get_db()
    # collection=db.standard
    # collection2 = db.standard_cleaned
    # data = get_data(collection, 8)
    # dc = standard_dict(data)
    # collection2.insert(dc)
    # for k,v in dc.items():
    #     print k,v

    # # The snippet below tests summary extraction
    # data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')
    # dc = standard_dict(data.text)
    # for k,v in dc.items():
    #     print k,v

    # # The snippet below tests the date-format conversion
    # l1 = [u"2017年6月28日"]
    # l2 = [u"2017年10月27日"]
    # l3 = [u"2017年12月1日"]
    # l4 = [u"2017年7月1日"]
    # print date_str(l1)
    # print date_str(l2)
    # print date_str(l3)
    # print date_str(l4)

查看全文

相关阅读:
UVA 254 Towers of Hanoi
UVA 701 The Archeologists' Dilemma
UVA 185 Roman Numerals
UVA 10994 Simple Addition
UVA 10570 Meeting with Aliens
UVA 306 Cipher
UVA 10160 Servicing Stations
UVA 317 Hexagon
UVA 10123 No Tipping
UVA 696 How Many Knights

原文地址：https://www.cnblogs.com/zhangtianyuan/p/7086107.html