zoukankan      html  css  js  c++  java
  • python爬虫:爬取医药数据库drugbank

    这个是帮朋友做的,难点在于对方网站有反爬虫机制,用 requests 一直抓取不成功,后来我就用 selenium 直接把网页保存到本地,然后再解析本地的 html 文件,就没有问题啦。

    现在看来,写得有点傻,多包涵。

    # -*- coding:utf-8 -*-

    import os
    import time
    import datetime
    import codecs
    from lxml import etree
    from selenium import webdriver
    import csv
    #控制编码,全英文网页,用不着
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')

    # Run date, kept both as a date object and as its 'YYYY-MM-DD' text form
    # (the text form is used as a prefix for the saved HTML file names).
    today = datetime.date.today()
    today_string = today.isoformat()

    #通过浏览器得到网页页面--反反爬虫
    def html_getter(site,file_name):
        """Load ``site`` in a real Firefox browser and save its HTML to ``file_name``.

        A browser is used instead of a plain HTTP request so that the page is
        fetched the way a human visitor would fetch it.

        Args:
            site: URL of the page to load.
            file_name: Path of the local file the page's HTML is written to
                (UTF-8 encoded; an existing file is overwritten).

        The browser is always shut down before returning, even if loading the
        page fails, so repeated calls do not pile up browser processes.
        """
        # To drive Chrome instead, build the driver with webdriver.Chrome(...)
        # pointing at a local chromedriver executable.
        driver = webdriver.Firefox()
        try:
            driver.get(site)
            driver.maximize_window()  # maximise the browser window
            time.sleep(5)  # give the browser time to finish rendering
            # The first node matched by "//*" is the document root, so this is
            # the outer HTML of the whole page. The generic find_element call is
            # used because find_element_by_xpath is gone from newer Selenium.
            root = driver.find_element("xpath", "//*")
            source_code = root.get_attribute("outerHTML")
        finally:
            driver.quit()

        # Persist the page; the context manager closes the file on any error.
        with codecs.open(file_name, 'w', 'utf8') as f:
            f.write(source_code)

    #打开保存在本地的html文件
    def file_html(file_name):
        """Return the text of the locally saved HTML file ``file_name``.

        The file is decoded as UTF-8, matching how ``html_getter`` writes it,
        rather than relying on the platform's default encoding.

        Args:
            file_name: Path of the HTML file to read.

        Returns:
            The complete file contents as a text string.
        """
        with codecs.open(file_name, 'r', 'utf8') as f:
            return f.read()

    #写入csv,也可以有其他写入方式,这个地方就csv
    def csv_writer(ll, csv_path='drugbank.csv'):
        """Append the scraped rows in ``ll`` to a CSV file.

        Args:
            ll: Iterable of row tuples, one value per column in the order of
                ``headers`` below.
            csv_path: Destination file; defaults to ``drugbank.csv`` in the
                current working directory.

        The header row is written only when the file does not exist yet or is
        empty, so appending on a later run does not insert a second header in
        the middle of the data.
        """
        import sys  # local import: sys is not imported at file level

        headers = ['drug','inter','snp_rs_id','Allele_name','Defining_change','Adverse_Reaction','ref','href','original_title']
        # Decide before opening: opening in append mode creates the file.
        need_header = not (os.path.isfile(csv_path) and os.path.getsize(csv_path) > 0)

        if sys.version_info[0] >= 3:
            # newline='' is what the csv module asks for on Python 3; without
            # it, Windows output gets an extra blank line after every row.
            f = open(csv_path, 'a', newline='', encoding='utf-8')
        else:
            # Python 2's csv module expects a binary-mode file instead.
            f = open(csv_path, 'ab')

        with f:
            f_csv = csv.writer(f)
            if need_header:
                f_csv.writerow(headers)
            f_csv.writerows(ll)

    #xpath解析网页,得到表格数据,我就是这么爱xpath,不喜欢正则表达式
    def _first_or_default(values, default='Not Available '):
        """Return the first XPath hit in ``values``, or ``default`` if there is none."""
        return values[0] if values else default


    def data_get(html):
        """Extract the rows of the results table from a saved page.

        Each ``<tr>`` under ``/html/body/main/table/tbody`` becomes one tuple
        ``(drug, inter, snp_rs_id, Allele_name, Defining_change,
        Adverse_Reaction, ref, href, original_title)``. Any cell that is
        missing is reported as ``'Not Available '`` instead of aborting the
        whole page.

        Args:
            html: HTML text of the page.

        Returns:
            The list of tuples parsed from this page. For existing callers,
            every tuple is also appended to the module-level list ``ll``.
        """
        selector = etree.HTML(html)
        if selector is None:
            # Guard in case the parser produced no root element.
            return []

        rows = []
        for each in selector.xpath('/html/body/main/table/tbody/tr'):
            # 1. drug: the <strong> text followed by the link text
            name_parts = each.xpath('td[1]/strong/text()')[:1] + each.xpath('td[1]/a/text()')[:1]
            drug = ' '.join(name_parts) if name_parts else 'Not Available '

            # 2. Interacting Gene/Enzyme: full text content of the cell
            gene_cells = each.xpath('td[2]')
            inter = gene_cells[0].xpath('string(.)') if gene_cells else 'Not Available '

            # 3. SNP RS ID
            snp_rs_id = _first_or_default(each.xpath('td[3]/a/text()'))
            # 4. Allele name
            Allele_name = _first_or_default(each.xpath('td[4]/text()'))
            # 5. Defining change
            Defining_change = _first_or_default(each.xpath('td[5]/text()'))
            # 6. Adverse Reaction
            Adverse_Reaction = _first_or_default(each.xpath('td[6]/text()'))

            # 7. Reference(s): link text, target and tooltip of the first link
            ref = _first_or_default(each.xpath('td[7]/span/a/text()'))
            href = _first_or_default(each.xpath('td[7]/span/a/@href'))
            original_title = _first_or_default(each.xpath('td[7]/span/a/@data-original-title'))

            tt=(drug,inter,snp_rs_id,Allele_name,Defining_change,Adverse_Reaction,ref,href,original_title)
            rows.append(tt)
            # Keep feeding the module-level accumulator that the script's
            # entry point passes to csv_writer.
            ll.append(tt)

        return rows

    #print ll



    if __name__ == '__main__':
        # Module-level accumulator: data_get() appends every parsed row here.
        ll=[]
        # Result pages 1 through 4.
        for page_num in range(1,5):
            site='http://www.drugbank.ca/genobrowse/snp-adr?page='+str(page_num)
            # Local snapshot name: <run date>drugbank_<page>.html.
            # A u'' format string gives the same name on Python 2 and 3; the
            # unicode() builtin used previously does not exist on Python 3.
            file_name=u'{0}drugbank_{1}.html'.format(today_string, page_num)

            # Fetch the page through the browser, then parse the saved copy.
            html_getter(site,file_name)
            html=file_html(file_name)
            data_get(html)

        # Write everything once, after all pages have been collected.
        csv_writer(ll)

  • 相关阅读:
    给出两个 非空 的链表用来表示两个非负的整数。其中,它们各自的位数是按照 逆序 的方式存储的,并且它们的每个节点只能存储 一位 数字。
    11
    实战 迁移学习 VGG19、ResNet50、InceptionV3 实践 猫狗大战 问题
    tx2系统备份与恢复
    如何在Ubuntu 18.04上安装和卸载TeamViewer
    bzoj 3732 Network (kruskal重构树)
    bzoj2152 聪聪可可 (树形dp)
    牛客 216D 消消乐 (二分图最小点覆盖)
    牛客 197E 01串
    Wannafly挑战赛23
  • 原文地址:https://www.cnblogs.com/miranda-tang/p/5508359.html
Copyright © 2011-2022 走看看