zoukankan      html  css  js  c++  java
  • Python解析xml文档实战案例

    xml文档

    <?xml version="1.0" ?>
    <!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2019//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd">
    <PubmedArticleSet>
    <PubmedArticle>
        <MedlineCitation Status="MEDLINE" Owner="NLM">
            <PMID Version="1">28901317</PMID>
            <DateCompleted>
                <Year>2018</Year>
                <Month>05</Month>
                <Day>10</Day>
            </DateCompleted>
            <DateRevised>
                <Year>2018</Year>
                <Month>12</Month>
                <Day>02</Day>
            </DateRevised>
            <Article PubModel="Print">
                <Journal>
                    <ISSN IssnType="Electronic">1998-4138</ISSN>
                    <JournalIssue CitedMedium="Internet">
                        <Volume>13</Volume>
                        <Issue>4</Issue>
                        <PubDate>
                            <Year>2017</Year>
                        </PubDate>
                    </JournalIssue>
                    <Title>Journal of cancer research and therapeutics</Title>
                    <ISOAbbreviation>J Cancer Res Ther</ISOAbbreviation>
                </Journal>
                <ArticleTitle><i>k-RAS</i> mutation and resistance to epidermal growth factor receptor-tyrosine kinase inhibitor treatment in patients with nonsmall cell lung cancer.</ArticleTitle>
                <Pagination>
                    <MedlinePgn>699-701</MedlinePgn>
                </Pagination>
                <ELocationID EIdType="doi" ValidYN="Y">10.4103/jcrt.JCRT_468_17</ELocationID>
                <Abstract>
                    <AbstractText Label="OBJECTIVE" NlmCategory="OBJECTIVE">The aim of this study was to evaluate the relationship between k-RAS gene mutation and the resistance to epidermal growth factor receptor-tyrosine kinase inhibitor (EGFR-TKI) treatment in patients with nonsmall-cell lung cancer (NSCLC).</AbstractText>
                    <AbstractText Label="METHODS" NlmCategory="METHODS">Forty-five pathologies confirmed NSCLC patients who received EGFR-TKI (Gefitinib) treatment were retrospectively included in this study. The mutation of codon 12 and 13, located in exon1 and exon 2 of k-RAS gene were examined by polymerase chain reaction (PCR) and DAN sequencing in tumor samples of the included 45 NSCLC patients. The correlation between Gefitinib treatment response and k-RAS mutation status was analyzed in tumor samples of the 45 NSCLC patients.</AbstractText>
                    <AbstractText Label="RESULTS" NlmCategory="RESULTS">Eight tumor samples of the 45 NSCLC patients were found to be mutated in coden 12 or 13, with an mutation rate of 17.8% (8/45); the objective response rate (ORR) was 29.7%(11/37) with 1 cases of complete response (CR) and 10 cases of partial response in k-RAS mutation negative patients. Furthermore, the ORR was 0.0% in k-RAS mutation positive patients with none CR. The ORR between k-RAS mutation and nonmutation patients were significant different (P &lt; 0.05).</AbstractText>
                    <AbstractText Label="CONCLUSION" NlmCategory="CONCLUSIONS">k-RAS gene mutation status was associated with the response of Gefitinib treatment in patients with NSCLC.</AbstractText>
                </Abstract>
                <AuthorList CompleteYN="Y">
                    <Author ValidYN="Y">
                        <LastName>Zhou</LastName>
                        <ForeName>Bin</ForeName>
                        <Initials>B</Initials>
                        <AffiliationInfo>
                            <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation>
                        </AffiliationInfo>
                    </Author>
                    <Author ValidYN="Y">
                        <LastName>Tang</LastName>
                        <ForeName>Congrong</ForeName>
                        <Initials>C</Initials>
                        <AffiliationInfo>
                            <Affiliation>Department of Pharmacy, The First Affiliated Hospital of Wenzhou Medical University, Wenzhou, Zhejiang, Province 325200, PR China.</Affiliation>
                        </AffiliationInfo>
                    </Author>
                    <Author ValidYN="Y">
                        <LastName>Li</LastName>
                        <ForeName>Jie</ForeName>
                        <Initials>J</Initials>
                        <AffiliationInfo>
                            <Affiliation>Department of Pharmacy, Ruian People's Hospital, Ruian, Zhejiang, Province 325200, PR China.</Affiliation>
                        </AffiliationInfo>
                    </Author>
                </AuthorList>
                <Language>eng</Language>
                <PublicationTypeList>
                    <PublicationType UI="D016428">Journal Article</PublicationType>
                </PublicationTypeList>
            </Article>
            <MedlineJournalInfo>
                <Country>India</Country>
                <MedlineTA>J Cancer Res Ther</MedlineTA>
                <NlmUniqueID>101249598</NlmUniqueID>
                <ISSNLinking>1998-4138</ISSNLinking>
            </MedlineJournalInfo>
            <ChemicalList>
                <Chemical>
                    <RegistryNumber>0</RegistryNumber>
                    <NameOfSubstance UI="C117307">KRAS protein, human</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>0</RegistryNumber>
                    <NameOfSubstance UI="D047428">Protein Kinase Inhibitors</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>0</RegistryNumber>
                    <NameOfSubstance UI="D011799">Quinazolines</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>EC 2.7.10.1</RegistryNumber>
                    <NameOfSubstance UI="C512478">EGFR protein, human</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>EC 2.7.10.1</RegistryNumber>
                    <NameOfSubstance UI="D066246">ErbB Receptors</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>EC 3.6.5.2</RegistryNumber>
                    <NameOfSubstance UI="D016283">Proto-Oncogene Proteins p21(ras)</NameOfSubstance>
                </Chemical>
                <Chemical>
                    <RegistryNumber>S65743JHBS</RegistryNumber>
                    <NameOfSubstance UI="D000077156">Gefitinib</NameOfSubstance>
                </Chemical>
            </ChemicalList>
            <CitationSubset>IM</CitationSubset>
            <MeshHeadingList>
                <MeshHeading>
                    <DescriptorName UI="D000328" MajorTopicYN="N">Adult</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D000368" MajorTopicYN="N">Aged</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D002289" MajorTopicYN="N">Carcinoma, Non-Small-Cell Lung</DescriptorName>
                    <QualifierName UI="Q000188" MajorTopicYN="Y">drug therapy</QualifierName>
                    <QualifierName UI="Q000235" MajorTopicYN="N">genetics</QualifierName>
                    <QualifierName UI="Q000473" MajorTopicYN="N">pathology</QualifierName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D019008" MajorTopicYN="N">Drug Resistance, Neoplasm</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D066246" MajorTopicYN="N">ErbB Receptors</DescriptorName>
                    <QualifierName UI="Q000037" MajorTopicYN="N">antagonists &amp; inhibitors</QualifierName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D005260" MajorTopicYN="N">Female</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D000077156" MajorTopicYN="N">Gefitinib</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D006801" MajorTopicYN="N">Humans</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D008297" MajorTopicYN="N">Male</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D008875" MajorTopicYN="N">Middle Aged</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D009154" MajorTopicYN="N">Mutation</DescriptorName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D047428" MajorTopicYN="N">Protein Kinase Inhibitors</DescriptorName>
                    <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D016283" MajorTopicYN="N">Proto-Oncogene Proteins p21(ras)</DescriptorName>
                    <QualifierName UI="Q000235" MajorTopicYN="Y">genetics</QualifierName>
                </MeshHeading>
                <MeshHeading>
                    <DescriptorName UI="D011799" MajorTopicYN="N">Quinazolines</DescriptorName>
                    <QualifierName UI="Q000008" MajorTopicYN="Y">administration &amp; dosage</QualifierName>
                </MeshHeading>
            </MeshHeadingList>
        </MedlineCitation>
        <PubmedData>
            <History>
                <PubMedPubDate PubStatus="entrez">
                    <Year>2017</Year>
                    <Month>9</Month>
                    <Day>14</Day>
                    <Hour>6</Hour>
                    <Minute>0</Minute>
                </PubMedPubDate>
                <PubMedPubDate PubStatus="pubmed">
                    <Year>2017</Year>
                    <Month>9</Month>
                    <Day>14</Day>
                    <Hour>6</Hour>
                    <Minute>0</Minute>
                </PubMedPubDate>
                <PubMedPubDate PubStatus="medline">
                    <Year>2018</Year>
                    <Month>5</Month>
                    <Day>11</Day>
                    <Hour>6</Hour>
                    <Minute>0</Minute>
                </PubMedPubDate>
            </History>
            <PublicationStatus>ppublish</PublicationStatus>
            <ArticleIdList>
                <ArticleId IdType="pubmed">28901317</ArticleId>
                <ArticleId IdType="pii">JCanResTher_2017_13_4_699_214476</ArticleId>
                <ArticleId IdType="doi">10.4103/jcrt.JCRT_468_17</ArticleId>
            </ArticleIdList>
        </PubmedData>
    </PubmedArticle>
    
    </PubmedArticleSet>

      方法一:xml.etree.cElementTree

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2019/4/25
    @Author: Zhang Yafei
    """
    import os
    import re
    import threading
    import xml.etree.cElementTree as ET
    from concurrent.futures import ThreadPoolExecutor
    from itertools import chain
    
    import pandas as pd
    
    
    # Column order of the header row that get_files_path() writes to each CSV.
    # Rows are appended without a header, so they must follow the same order.
    _CSV_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country', 'file_path', 'journal',
                    'mesh_words', 'pub_year']
    # One lock shared by every worker thread; a lock created per call protects nothing.
    _CSV_LOCK = threading.Lock()


    def pubmed_xml_parser(path):
        """Parse one PubMed XML file and append a CSV row for each distinct article.

        Rows go to ``<first component of path>.csv`` in the current working
        directory, appended without a header in the order of ``_CSV_COLUMNS``.
        Articles whose PMID was already seen in this file are skipped.

        :param path: path of the XML file; backslash and forward slash are both
            accepted as separators when deriving the CSV name.
        :raises AttributeError: if an article has no PMID, Article or
            Journal/ISOAbbreviation element.
        """
        # The first path component names the output CSV.
        dir_name = re.split(r'[\\/]', path, maxsplit=1)[0]
        print(dir_name)
        root = ET.parse(path).getroot()
        rows = []
        seen_pmids = set()
        for article in root.iter('PubmedArticle'):
            citation = article.find('MedlineCitation')
            pmid = citation.find('PMID').text
            if pmid in seen_pmids:
                continue
            seen_pmids.add(pmid)
            article_node = citation.find('Article')
            journal = article_node.find('Journal').find('ISOAbbreviation').text

            # dict keys act as an insertion-ordered set, so the joined output is deterministic.
            affiliations = {}
            for node in article_node.iterfind('AuthorList/Author/AffiliationInfo/Affiliation'):
                if node.text:
                    affiliations[node.text] = None
            affiliations_info = ';'.join(affiliations)

            # Prefer <Year>; otherwise take the first run of digits from <MedlineDate>.
            date = article_node.findtext('Journal/JournalIssue/PubDate/Year')
            if date is None:
                medline_date = article_node.findtext('Journal/JournalIssue/PubDate/MedlineDate') or ''
                year_match = re.search(r'\d+', medline_date)
                date = year_match.group(0) if year_match else ''

            # Keep the diagnostic print of PMIDs that carry no MeSH list.
            if citation.find('MeshHeadingList') is None:
                print(pmid)
            mesh_words = []
            for heading in citation.iterfind('MeshHeadingList/MeshHeading'):
                terms = list(heading)
                if len(terms) == 1:
                    # A lone child is recorded as-is.
                    mesh_words.append(terms[0].text or '')
                    continue
                descriptor = ''
                for term in terms:
                    if term.tag == 'DescriptorName':
                        descriptor = term.text or ''
                    elif descriptor and term.tag == 'QualifierName':
                        mesh_words.append(descriptor + '/' + (term.text or ''))
            mesh_words = ';'.join(mesh_words)

            # Element.getchildren() was removed in Python 3.9; iterate the children via a path instead.
            article_type = '/'.join(node.text or '' for node in article_node.iterfind('PublicationTypeList/*'))
            country = citation.findtext('MedlineJournalInfo/Country') or ''
            rows.append(
                {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info, 'pub_year': date,
                 'mesh_words': mesh_words,
                 'country': country, 'article_type': article_type, 'file_path': path})
            print(pmid + '\t解析完成')
        # Explicit columns keep each value under the matching header column.
        df = pd.DataFrame(rows, columns=_CSV_COLUMNS)
        with _CSV_LOCK:
            df.to_csv('{}.csv'.format(dir_name), encoding='utf_8_sig', mode='a', index=False, header=False)
    
    
    def to_excel(data, path):
        """Write *data* to an Excel workbook at *path*, in a sheet named ``table``.

        :param data: object exposing ``to_excel`` (e.g. a pandas DataFrame).
        :param path: destination workbook path.
        """
        # ExcelWriter.save() was removed in pandas 2.0; leaving the with-block
        # saves and closes the workbook, including when writing raises.
        with pd.ExcelWriter(path) as writer:
            data.to_excel(writer, sheet_name='table', index=False)
    
    
    def get_files_path():
        """Return the XML files that still need parsing, as a sorted list.

        Walks the three drug-category folders for ``*.xml`` files and drops
        every path already listed in the ``file_path`` column of that
        category's ``<category>.csv``. A category CSV that does not exist yet
        is created with only a header row. Prints the total and
        already-parsed counts.

        :return: sorted list of XML paths not yet recorded in any category CSV.
        """
        categories = ('first in class drug', 'follow on drug', 'me too drug')
        columns = ['PMID', 'affiliations_info', 'article_type', 'country', 'file_path', 'journal', 'mesh_words',
                   'pub_year']
        file_list = []
        parsed_files = set()
        for category in categories:
            # Accumulate across every sub-directory; a missing folder contributes nothing.
            for base_path, _folders, files in os.walk(category):
                file_list.extend(os.path.join(base_path, name) for name in files if name.endswith('.xml'))
            csv_path = category + '.csv'
            if os.path.exists(csv_path):
                df = pd.read_csv(csv_path, encoding='utf_8_sig')
                parsed_files.update(df.file_path.dropna().tolist())
            else:
                # Each CSV gets its own header, so later header-less appends stay readable.
                pd.DataFrame(columns=columns).to_csv(csv_path, encoding='utf_8_sig', index=False)
        print('共需解析文件:{0}'.format(len(file_list)))
        print('已解析文件:{0}'.format(len(parsed_files)))
        return sorted(set(file_list) - parsed_files)
    
    
    if __name__ == '__main__':
        # Parse every pending XML file on a thread pool and report per-file failures.
        files_list = get_files_path()
        if not files_list:
            print('全部解析完成')
        else:
            with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
                # Keep one future per file: the results of pool.map() were never
                # read, so an exception inside a worker disappeared silently.
                futures = [(file_path, pool.submit(pubmed_xml_parser, file_path)) for file_path in files_list]
                for file_path, future in futures:
                    try:
                        future.result()
                    except Exception as exc:  # top-level boundary: report and keep going
                        print('{0}\t解析失败: {1!r}'.format(file_path, exc))

      方法二:lxml+xpath

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2019/4/26
    @Author: Zhang Yafei
    """
    import os
    import re
    import threading
    from concurrent.futures import ThreadPoolExecutor
    
    from lxml import etree
    import pandas as pd
    
    
    # Column order of the header row that get_files_path() writes to pubmed.csv.
    # Rows are appended without a header, so they must follow the same order.
    _PUBMED_COLUMNS = ['PMID', 'affiliations_info', 'article_type', 'country', 'file_path', 'journal',
                       'mesh_words', 'pub_year']
    # One lock shared by every worker thread; a lock created per call protects nothing.
    _PUBMED_CSV_LOCK = threading.Lock()


    def pubmed_xpath_parse(path):
        """Parse one PubMed XML file with lxml/XPath and append its rows to ``pubmed.csv``.

        One row is written per distinct PMID in the file, appended without a
        header in the order of ``_PUBMED_COLUMNS``.

        :param path: path of the XML file to parse.
        :raises IndexError: if an article has no PMID, Article or
            Journal/ISOAbbreviation element.
        """
        tree = etree.parse(path)
        # If the XML carries a DTD declaration that must be honoured, build a
        # DTD-loading parser instead (the DTD file must sit next to the XML file):
        # parser = etree.XMLParser(load_dtd=True)
        # tree = etree.parse('1.xml', parser=parser)
        rows = []
        seen_pmids = set()
        for article in tree.xpath('//PubmedArticle'):
            pmid = str(article.xpath('MedlineCitation/PMID/text()')[0])
            if pmid in seen_pmids:
                continue
            seen_pmids.add(pmid)
            article_node = article.xpath('MedlineCitation/Article')[0]
            journal = str(article_node.xpath('Journal/ISOAbbreviation/text()')[0])

            # dict.fromkeys de-duplicates while keeping first-seen order; string()
            # yields '' for an empty element instead of raising IndexError.
            affiliations = [node.xpath('string()')
                            for node in article_node.xpath('AuthorList/Author/AffiliationInfo/Affiliation')]
            affiliations_info = ';'.join(dict.fromkeys(text for text in affiliations if text))

            # Prefer <Year>; otherwise take the first run of digits from <MedlineDate>.
            years = article_node.xpath('Journal/JournalIssue/PubDate/Year/text()')
            if years:
                date = str(years[0])
            else:
                medline_date = article_node.xpath('string(Journal/JournalIssue/PubDate/MedlineDate)')
                year_match = re.search(r'\d+', medline_date)
                date = year_match.group(0) if year_match else ''

            mesh_words = []
            for heading in article.xpath('MedlineCitation/MeshHeadingList/MeshHeading'):
                terms = heading.xpath('child::*')
                if len(terms) == 1:
                    # A lone child is recorded as-is.
                    mesh_words.append(terms[0].xpath('string()'))
                    continue
                descriptor = ''
                for term in terms:
                    if term.tag == 'DescriptorName':
                        descriptor = term.xpath('string()')
                    elif descriptor and term.tag == 'QualifierName':
                        mesh_words.append(descriptor + '/' + term.xpath('string()'))
            mesh_words = ';'.join(mesh_words)

            article_type = '/'.join(
                node.xpath('string()') for node in article_node.xpath('PublicationTypeList/PublicationType'))
            country = article.xpath('string(MedlineCitation/MedlineJournalInfo/Country)')
            rows.append(
                {'PMID': pmid, 'journal': journal, 'affiliations_info': affiliations_info, 'pub_year': date,
                 'mesh_words': mesh_words,
                 'country': country, 'article_type': article_type, 'file_path': path})
            print(pmid + '\t解析完成')
        # Write once, after the loop: writing inside it re-appended every earlier row on each article.
        df = pd.DataFrame(rows, columns=_PUBMED_COLUMNS)
        with _PUBMED_CSV_LOCK:
            df.to_csv('pubmed.csv', encoding='utf_8_sig', mode='a', index=False, header=False)
    
    
    def to_excel(data, path):
        """Save *data* as an Excel workbook at *path*, using a single sheet called ``table``.

        :param data: object exposing ``to_excel`` (e.g. a pandas DataFrame).
        :param path: destination workbook path.
        """
        # The context manager replaces writer.save(), which pandas 2.0 removed,
        # and also closes the file if writing fails.
        with pd.ExcelWriter(path) as writer:
            data.to_excel(writer, sheet_name='table', index=False)
    
    
    def get_files_path():
        """Return the XML files not yet recorded in ``pubmed.csv``, as a sorted list.

        Walks the three drug-category folders for ``*.xml`` files and drops
        every path found in the ``file_path`` column of ``pubmed.csv``. If
        that CSV does not exist it is created with only a header row. Prints
        the total and already-parsed counts.

        :return: sorted list of XML paths that still need parsing.
        """
        file_list = []
        for category in ('first in class drug', 'follow on drug', 'me too drug'):
            # Accumulate across every sub-directory; a missing folder contributes nothing.
            for base_path, _folders, files in os.walk(category):
                file_list.extend(os.path.join(base_path, name) for name in files if name.endswith('.xml'))
        print('共需解析文件:{0}'.format(len(file_list)))
        if os.path.exists('pubmed.csv'):
            df = pd.read_csv('pubmed.csv', encoding='utf_8_sig')
            parsed_files = set(df.file_path.dropna())
        else:
            df = pd.DataFrame(columns=['PMID', 'affiliations_info', 'article_type', 'country', 'file_path',
                                       'journal', 'mesh_words', 'pub_year'])
            df.to_csv('pubmed.csv', encoding='utf_8_sig', index=False)
            parsed_files = set()
        print('已解析文件:{0}'.format(len(parsed_files)))
        return sorted(set(file_list) - parsed_files)
    
    
    if __name__ == '__main__':
        # Parse every pending XML file on a thread pool and report per-file failures.
        files_list = get_files_path()
        if not files_list:
            print('全部解析完成')
        else:
            # The with-block shuts the executor down once every task has finished.
            with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
                # Keep one future per file: the results of pool.map() were never
                # read, so an exception inside a worker disappeared silently.
                futures = [(file_path, pool.submit(pubmed_xpath_parse, file_path)) for file_path in files_list]
                for file_path, future in futures:
                    try:
                        future.result()
                    except Exception as exc:  # top-level boundary: report and keep going
                        print('{0}\t解析失败: {1!r}'.format(file_path, exc))
    

      

      

  • 相关阅读:
    DevExpress中的gridControl选择问题
    使用IL DASM来查看接口内的自动属性
    多态的使用 虚方法、抽象类、接口
    匿名类的使用及原理
    JAVA小问题解决办法
    数据库02
    数据库01(验证连接是否成功)
    Java 类库知识总结
    Java多线程知识总结(补充)
    java Thread和Runable的深刻理解
  • 原文地址:https://www.cnblogs.com/zhangyafei/p/10776698.html
Copyright © 2011-2022 走看看