zoukankan      html  css  js  c++  java
  • 美国在研新药_读取单个PDF

     sklearn实战-乳腺癌细胞数据挖掘(博主亲自录制视频)

    QQ:231469242

    读取已下载的美国在研新药PDF中的数据:UNII、分子式、分子量、药品名、WHO编号、编码等。

    PDF的排版没有固定的逻辑规则,因此无法百分之百提取,只能提取其中一部分字段。

    有几个字段默认为空(例如 me_bianma 和 me_ylbm)。



    # -*- coding: utf-8 -*-
"""
Extract drug metadata fields from a single PDF statement.

The script reads one PDF, converts it to text with pdfminer, splits the text
into lines and then pulls individual fields (name, formula, molecular weight,
CAS, WHO number, UNII, ...) out of those lines with simple heuristics.
Extraction is best-effort: a field that cannot be found is returned as "".

io.open() is the preferred, higher-level interface to file I/O. It wraps the
OS-level file descriptor in an object that you can use to access the file in a
Pythonic manner.
os.open() is just a wrapper for the lower-level POSIX syscall. It takes less
symbolic (and more POSIX-y) arguments, and returns the file descriptor (a
number) that represents the opened file. It does not return a file object; the
returned value will not have read() or write() methods.
"""
import math
import re
from io import StringIO
from io import open

# pip3 install pdfminer3k
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

# Name of the PDF file to read.
pdfFilename = "atesidorsen sodium.pdf"
# Prefix prepended to the PDF file name to build the download path.
frontName = "usan/2016/"
# File holding the known trademark names, one per line.
trademark_filename = "trademarks.txt"
# File holding the known sponsor names, one per line.
sponsor_filename = "sponsor.txt"

# Patterns are compiled once at import time instead of on every call.
# Molecular formula: contains carbon and hydrogen counts, e.g. "C20H25".
_FORMULA_RE = re.compile(r'C\d+H\d+')
# CAS registry number shape: digits-digits-digits.
_CAS_RE = re.compile(r'\d+-\d+-\d+')
# WHO number: a run of digits.
_WHO_RE = re.compile(r'\d+')
# UNII: {10} means exactly ten consecutive alphanumeric characters.
_UNII_RE = re.compile(r'[A-Za-z0-9]{10}')


def readPDF(pdfFile):
    """Return the text content of an opened PDF.

    Args:
        pdfFile: a binary file object positioned at the start of a PDF.

    Returns:
        The text produced by pdfminer's TextConverter as a single string.
    """
    rsrcmgr = PDFResourceManager()
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when parsing fails.
            device.close()
        return retstr.getvalue()


def Format(str1):
    """Split extracted PDF text into a list of non-blank lines.

    The text is split on line breaks, not on spaces: the extractors below look
    for multi-word markers such as "MOLECULAR WEIGHT" inside a single item,
    which can only match when each item is a whole line.

    Args:
        str1: the raw text returned by readPDF().

    Returns:
        The lines of ``str1`` in order, with empty and whitespace-only lines
        removed. Kept lines are not stripped.
    """
    return [line for line in str1.split("\n") if line.strip()]


def _first_containing(items, marker):
    """Return the first item that contains ``marker``, or "" if none does."""
    for item in items:
        if marker in item:
            return item
    return ""


def _is_number(text):
    """Return True if ``text`` parses as a finite int or float literal."""
    try:
        return math.isfinite(float(text))
    except ValueError:
        return False


def _is_integer(text):
    """Return True if ``text`` parses as an integer literal."""
    try:
        int(text)
    except ValueError:
        return False
    return True


def _value_after_marker(items, marker, is_valid):
    """Return the item following the first ``marker`` line that validates.

    Every item containing ``marker`` is considered in order; the item right
    after it is returned as soon as ``is_valid`` accepts it. A marker on the
    very last item has no following value and is skipped.
    """
    for index, item in enumerate(items[:-1]):
        if marker in item:
            candidate = items[index + 1]
            if is_valid(candidate):
                return candidate
    return ""


def _first_with_known_name(items, names):
    """Return the first stripped item containing any of ``names``, or ""."""
    for item in items:
        item = item.strip()
        for name in names:
            # An empty name is a substring of everything; never match on it.
            if name and name in item:
                return item
    return ""


def Get_me_usan(the_list_data):
    """Return me_usan (the drug name): the first item, or "" if there is none."""
    return the_list_data[0] if the_list_data else ""


def Get_me_therapeutic(the_list_data):
    """Return me_therapeutic: the first item containing "Treatment of", or ""."""
    return _first_containing(the_list_data, "Treatment of")


def Get_me_chemical1(the_list_data):
    """Return me_chemical1 (chemical name 1): first item containing "1. ", or ""."""
    return _first_containing(the_list_data, "1. ")


def Get_me_chemical2(the_list_data):
    """Return me_chemical2 (chemical name 2): first item containing "2. ", or ""."""
    return _first_containing(the_list_data, "2. ")


def Re_formula(str1):
    """Return True if ``str1`` contains a C<digits>H<digits> formula fragment."""
    return _FORMULA_RE.search(str1) is not None


def Get_me_mo_formula(the_list_data):
    """Return me_mo_formula: the first item that looks like a molecular formula.

    An item qualifies when, after upper-casing, it contains carbon and
    hydrogen counts (see Re_formula). The upper-cased item is returned;
    "" is returned when nothing qualifies.
    """
    for item in the_list_data:
        upper_item = item.upper()
        if Re_formula(upper_item):
            return upper_item
    return ""


def Get_me_mo_weight(the_list_data):
    """Return me_mo_weight: the numeric item that follows "MOLECULAR WEIGHT".

    The value is validated by parsing it as a number rather than by eval():
    the text comes from a PDF and must never be executed as code.

    Returns:
        The item after a "MOLECULAR WEIGHT" item when it is an int or float
        literal, otherwise "".
    """
    return _value_after_marker(the_list_data, 'MOLECULAR WEIGHT', _is_number)


def Get_txt_contents(filename):
    """Read a name list file and return its non-blank lines.

    Args:
        filename: path of a text file with one name per line.

    Returns:
        A list of the lines with surrounding whitespace (including the line
        ending) removed; blank lines are dropped so that no empty name ends
        up in the list.
    """
    with open(filename) as file:
        stripped = (line.strip() for line in file)
        return [name for name in stripped if name]


def Get_me_trademark(the_list_data, trademarks=None):
    """Return me_trademark: the first item containing a known trademark.

    Args:
        the_list_data: the lines produced by Format().
        trademarks: names to look for; defaults to the module-level
            ``list_trademarks`` loaded from trademarks.txt.

    Returns:
        The stripped matching item, or "" when no trademark is found.
    """
    if trademarks is None:
        trademarks = list_trademarks
    return _first_with_known_name(the_list_data, trademarks)


def Get_me_sponsor(the_list_data, sponsors=None):
    """Return me_sponsor: the first item containing a known sponsor.

    Args:
        the_list_data: the lines produced by Format().
        sponsors: names to look for; defaults to the module-level
            ``list_sponsors`` loaded from sponsor.txt.

    Returns:
        The stripped matching item, or "" when no sponsor is found (for
        example a company that is not yet in sponsor.txt).
    """
    if sponsors is None:
        sponsors = list_sponsors
    return _first_with_known_name(the_list_data, sponsors)


def Re_CAS(str1):
    """Return True if ``str1`` contains a digits-digits-digits sequence."""
    return _CAS_RE.search(str1) is not None


def Get_CAS(the_list_data):
    """Return the first item containing a CAS-shaped number, or ""."""
    for item in the_list_data:
        if Re_CAS(item):
            return item
    return ""


def Re_WHO(str1):
    """Return True if ``str1`` contains at least one digit."""
    return _WHO_RE.search(str1) is not None


def Get_WHO(the_list_data):
    """Return the integer item that follows a "WHO NUMBER" item, or "".

    The value is validated by parsing it as an integer rather than by eval():
    the text comes from a PDF and must never be executed as code.
    """
    return _value_after_marker(the_list_data, 'WHO NUMBER', _is_integer)


def Re_UNII(str1):
    """Return True if ``str1`` contains ten consecutive alphanumeric characters."""
    return _UNII_RE.search(str1) is not None


def Get_UNII(the_list_data):
    """Return the UNII-shaped item that follows a "UNII" item, or ""."""
    return _value_after_marker(the_list_data, 'UNII', Re_UNII)


def Get_me_down(the_list_data):
    """Return me_down: the path prefix joined with the PDF file name.

    ``the_list_data`` is accepted for symmetry with the other extractors but
    is not used.
    """
    return frontName + pdfFilename


# Read the PDF and close the file as soon as the text has been extracted.
with open(pdfFilename, 'rb') as pdfFile:
    outputString = readPDF(pdfFile)
list_data = Format(outputString)

me_source = 2016
# me_usan: drug name.
me_usan = Get_me_usan(list_data)
# me_therapeutic: the disease being treated.
me_therapeutic = Get_me_therapeutic(list_data)
# me_chemical1: chemical name 1.
me_chemical1 = Get_me_chemical1(list_data)
# me_chemical2: chemical name 2.
me_chemical2 = Get_me_chemical2(list_data)
# me_mo_formula: recognised by its carbon/hydrogen counts.
me_mo_formula = Get_me_mo_formula(list_data)
# me_mo_weight: molecular weight.
me_mo_weight = Get_me_mo_weight(list_data)
# Known trademark names.
list_trademarks = Get_txt_contents(trademark_filename)
# me_trademark: trademark name.
me_trademark = Get_me_trademark(list_data)
# Known sponsor names.
list_sponsors = Get_txt_contents(sponsor_filename)
# me_sponsor: sponsor; a new company will not be found.
me_sponsor = Get_me_sponsor(list_data)
# CAS number.
me_CAS = Get_CAS(list_data)
# WHO number.
me_WHO = Get_WHO(list_data)
# UNII code.
me_UNII = Get_UNII(list_data)
# me_down: download path.
me_down = Get_me_down(list_data)
# me_bianma is empty by default.
me_bianma = ""
# me_ylbm is empty by default.
me_ylbm = ""

  • 相关阅读:
    IE浏览器请求数据是提示下载的问题
    jS清除浏览器缓存
    JS获取时间戳
    keycode
    JS简单解决并发量
    写移动端流氓方法,无意看到,分享下
    CSS中的rem的换算
    jsp会话监听
    jsonp在jsp中的使用
    Java中的位运算符
  • 原文地址:https://www.cnblogs.com/webRobot/p/6225279.html
Copyright © 2011-2022 走看看