zoukankan      html  css  js  c++  java
  • 提取mongodb中论文的信息,填入mysql,加快统计速度

    1.创建mysql的alzheimer表,包括pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,keyword,reference信息。

    #encoding = utf-8
    import pymysql
    import json
    
    def input_from_json(filename):
        """Load and return the JSON document stored in *filename*.

        Used below to read id_list.json, which holds {"idList": [...]}.
        """
        # json.load streams straight from the file object instead of
        # reading the whole file into memory first (json.loads(read())).
        with open(filename, 'r') as file:
            return json.load(file)
    # Shared MySQL connection/cursor used by every helper below; committed
    # and closed at the bottom of the script.
    # NOTE(review): hard-coded localhost and an empty root password are
    # development-only settings — do not ship as-is.
    conn= pymysql.connect(
            host='localhost',
            port = 3306,
            user='root',
            passwd='',
            db='python'
            )
    cur = conn.cursor()
    
    def createdb():
        """Create the `alzheimer` table holding one row per PMC paper.

        Columns mirror the fields extracted from each XML record in the
        second script: pmc_id, journal, title, abstract, authors, first
        affiliation, publication year, keywords and reference ids.

        The original issued CREATE TABLE plus nine separate ALTER TABLE
        statements; a single CREATE TABLE is atomic and avoids the extra
        server round trips.
        """
        cur.execute(
            'create table alzheimer('
            'id int AUTO_INCREMENT,'
            'pmc_id int,'
            'journal text,'
            'title text,'
            'abstract text,'
            'namestr text,'
            'authorinfor text,'
            'pub_year varchar(5),'
            'union_kwd_str text,'
            'reference_str text,'
            'PRIMARY KEY (id))'
        )
    
    createdb()
    
    # NOTE(review): everything below this point is one-off experimental
    # helper code (the original comment read "up is all the test").
    def addnewcloumn():
        """Add bookkeeping columns to the scratch `test` and `ad` tables."""
        for statement in (
                'alter table test add transactor varchar(10)',
                'alter table ad add hasid varchar(10)',
        ):
            cur.execute(statement)
    
    def addtomysql():
        """Bulk-insert (row-number, pmc_id) pairs from id_list.json into `ad`."""
        idlist = input_from_json('id_list.json')['idList']
        rows = list(enumerate(idlist))
        cur.executemany('insert into ad  values(%s,%s)', rows)
    
    
    def updatenewcol():
        """Flag every `ad` row whose id appears in id_list.json with hasid='yes'."""
        idlist = input_from_json('id_list.json')['idList']
        rows = [('yes', position) for position in range(len(idlist))]
        cur.executemany('update ad set hasid = (%s) where id = (%s)', rows)
    
    def selectpmcid():
        """Exploratory helper: fetch and print pmc_id values from `ad`.

        Python 2 script, hence the `print` statements.
        """
        sql = "SELECT pmc_id FROM ad "
        # For a SELECT, pymysql's execute() returns the number of rows found.
        a = cur.execute(sql)
        print a
        b = cur.fetchmany(a)  # fetch all `a` rows (~7887 ids at the time)
    
        print b[0][0],b[2][0]
        print type(b[0][0])   # each pmc_id comes back as an int
    
    def addnewcolunm():
        """Add the paper-metadata columns (same set as `alzheimer`) to `ad`."""
        for definition in (
                'journal text',
                'title text',
                'abstract text',
                'namestr text',
                'authorinfor text',
                'pub_year varchar(5)',
                'union_kwd_str text',
                'reference_str text',
        ):
            cur.execute('alter table ad add ' + definition)
    
    def inserttest():
        """Create the small `test2` scratch table used for insert benchmarks."""
        for statement in (
                'create table test2(id int AUTO_INCREMENT,PRIMARY KEY (id))',
                'alter table test2 add pmc_id int',
                'alter table test2 add title text',
        ):
            cur.execute(statement)
    def inserttest2():
        """Insert ten dummy (pmc_id, title) rows into the `test2` scratch table."""
        rows = [(n, 'hello' + str(n)) for n in range(10)]
        cur.executemany('insert into test2(pmc_id,title) values(%s,%s)', rows)
    
       
    # Persist everything issued above, then release the cursor and connection.
    conn.commit()
    cur.close()
    conn.close()

    2.从mongodb中获取信息并且填入mysql,因为有12万条,中间可能有不正确的数据无法填入,分段插入。实际结果表明,insert语句比update语句快了将近十倍不止。最终选择新建一个完全为空的表,采用insert进行插入。

    问题:引用的信息提取的有问题。接下来要进行的工作。

    from pymongo import MongoClient
    from bs4 import BeautifulSoup
    from nltk.tokenize import MWETokenizer
    import re
    import pymysql
    import time
    
    # Connect to the local MongoDB instance holding the downloaded PMC XML.
    client = MongoClient('localhost', 27017)
    db = client['ad']   # or equivalently: db = client.ad
    collection = db['xml']   # documents shaped like {'pmc_id': ..., 'xml': ...}
    
    # Rows accumulated here for one bulk INSERT into MySQL after the loop.
    mongodb_to_mysql_list = []
    num = 0   # progress counter, printed each iteration
    # NOTE(review): the ~120k documents were processed in manually-advanced
    # slices across several runs: [:27810], [27811:60000], [60000:100000],
    # [100001:109860] — documents 100000 and 109861 could not be inserted.
    # The slice below continues from 109861.
    begin = time.time()
    for item in collection.find()[109861:]:
        pmc_xml = item['xml']
        pmc_id = item['pmc_id'].encode('utf-8')
        pmc_id = int(pmc_id)
        soup = BeautifulSoup(pmc_xml,"lxml")  #if not add "lxml" will warning but not error
    
        print num
        #find the journal
        journal = soup.find('journal-title')
        if journal != None:
            journal=journal.text.encode('utf-8')
        else:
            journal = ''
        
        #find the article
        title = soup.find('article-title')
        if title != None:
            title = title.text.encode('utf-8')
        else:
            title = ''
    
        #show the author
        authornamelist = soup.find_all('contrib')
        namestr = ''
        for x in authornamelist:
            if x.surname != None:
                name = x.surname.text
                if x.surname.next_sibling != None:
                    if x.surname.next_sibling.next_sibling != None:
                        name = name +' '+ x.surname.next_sibling.next_sibling.text
            namestr = namestr+name.encode('utf-8')+','
    
        #show the firt author information
        authorinfor = ''
        authorinfor = soup.find('aff')
        if authorinfor != None:
            authorinfor =authorinfor.text.encode('utf-8')
        else:
            authorinfor = ''
    
        #show the receive time year
        pub_year = soup.find('year')
        if pub_year != None:
            pub_year=pub_year.text.encode('utf-8')
        else:
            pub_year=''
        
        #show the abstract
        a = soup.find('abstract')
        if a != None:
            if a.p != None:
                abstract = a.p.text.encode('utf-8')
            else:
                abstract = ''
    
        #show the key-words
        kwdlist = soup.find_all('kwd')
        union_kwd_str = ''
        for x in kwdlist:
            kwd = x.text.lower().encode('utf-8')
            kwdstr = re.sub(""|,|.", "", kwd)
            kwd = tuple(kwdstr.split())
            tokenizer = MWETokenizer([kwd])
            union_kwd = tokenizer.tokenize(kwdstr.split())
            if union_kwd != []:
                union_kwd_str = union_kwd_str+union_kwd[0]+','
    
        #show the reference id
        pub_id_list = soup.find_all('pub-id')
        reference_idlist = []
        reference_str = ''
        for x in pub_id_list:
            if x != None:
                reference = x.text.encode('utf-8')
                reference_idlist.append(reference)
            reference_str = reference_str+reference+','
    
        mongodb_to_mysql_list.append((pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str))
    
        num += 1
    
    print num   # total number of documents processed in this run
    end1 = time.time() -begin
    print end1   # seconds spent in the extraction loop
    
    
    # Fresh MySQL connection for the bulk insert (dev-only credentials).
    conn= pymysql.connect(
            host='localhost',
            port = 3306,
            user='root',
            passwd='',
            db='python'
            )
    cur = conn.cursor()
    # Rejected alternative — UPDATE took ~99 s per 100 rows:
    #cur.executemany('update ad set journal=(%s),title=(%s),abstract=(%s),namestr=(%s),authorinfor=(%s),pub_year=(%s),union_kwd_str=(%s),reference_str=(%s) where pmc_id = (%s)',mongodb_to_mysql_list)
    # Chosen approach — INSERT into the empty table, ~8.5 s per 100 rows:
    cur.executemany('insert into alzheimer(pmc_id,journal,title,abstract,namestr,authorinfor,pub_year,union_kwd_str,reference_str) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',mongodb_to_mysql_list)
    
    #cur.executemany('insert into test2(pmc_id,title) values(%s,%s)',values)
    conn.commit()
    cur.close()
    conn.close()
    
    end2 = time.time() -begin
    print end2   # total elapsed time including the MySQL insert
  • 相关阅读:
    170601、单例模式的三种水平代码(第三种最佳)
    解决打开pycharm有带图片md文件卡死问题
    Dockerfile 操作
    Docker 命令大全
    MAC
    mac 搭建selenium与ChromeDriver环境
    Mac进行 usr/bin 目录下修改权限问题,operation not permitted
    pytest文档6-fixture之yield实现teardown
    pytest文档5-fixture之conftest.py
    pytest文档4-测试用例setup和teardown
  • 原文地址:https://www.cnblogs.com/lovely7/p/6178865.html
Copyright © 2011-2022 走看看