  • Python crawler: scraping the pharmaceutical database drugbank

    I wrote this for a friend. The tricky part was that the site has an anti-scraping mechanism, so plain requests kept failing. In the end I used selenium to save a copy of each page through a real browser and then parsed the local html files, and that worked without any problem.

    Looking back at it now, it is written a bit naively -- please bear with me.
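    The core idea as a minimal sketch (assuming Firefox and geckodriver are installed; the URL and the table XPath are the same ones the full script below uses):

    from lxml import etree
    from selenium import webdriver

    # Let a real browser render the page, then hand the HTML to lxml.
    driver = webdriver.Firefox()
    driver.get('http://www.drugbank.ca/genobrowse/snp-adr?page=1')
    page_source = driver.page_source  # HTML after the page has rendered
    driver.quit()

    rows = etree.HTML(page_source).xpath('/html/body/main/table/tbody/tr')
    print(len(rows))  # number of table rows found on the listing page

    The full script follows.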

    # -*- coding:utf-8 -*-

    import os
    import time
    import datetime
    import codecs
    from lxml import etree
    from selenium import webdriver
    import csv
    # Force the default encoding; the site is all English, so this isn't needed
    # import sys
    # reload(sys)
    # sys.setdefaultencoding('utf-8')

    # convert today's date to a string
    today = datetime.date.today()
    today_string = today.strftime('%Y-%m-%d')

    # Fetch the page through a real browser -- gets past the anti-scraping check
    def html_getter(site, file_name):
        driver = webdriver.Firefox()
        # chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
        # os.environ['webdriver.chrome.driver'] = chromedriver
        # driver = webdriver.Chrome(chromedriver)
        driver.get(site)
        driver.maximize_window()  # maximize the browser window
        time.sleep(5)  # give the browser time to finish rendering
        # save the rendered page
        source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML")
        f = codecs.open(file_name, 'w+', 'utf8')
        f.write(source_code)
        f.close()
        driver.quit()

    # Read back the html file saved locally
    def file_html(file_name):
        f = open(file_name, 'r')
        html = f.read()
        f.close()
        return html

    # Write the rows out as CSV; any other output format would work just as well
    def csv_writer(ll):
        headers = ['drug', 'inter', 'snp_rs_id', 'Allele_name', 'Defining_change',
                   'Adverse_Reaction', 'ref', 'href', 'original_title']
        with open('drugbank.csv', 'a') as f:
            f_csv = csv.writer(f)
            f_csv.writerow(headers)
            f_csv.writerows(ll)

    # Parse the page with XPath to pull out the table data --
    # I just love XPath and dislike regular expressions
    def data_get(html):
        selector = etree.HTML(html)
        tbody = selector.xpath('/html/body/main/table/tbody/tr')
        for each in tbody:
            # 1. 'drug'
            drug_name = each.xpath('td[1]/strong/text()')[0]
            drug_sn = each.xpath('td[1]/a/text()')[0]
            drug = drug_name + ' ' + drug_sn
            # print(drug)
            # 2. 'Interacting Gene/Enzyme'
            inter_cell = each.xpath('td[2]')[0]
            inter = inter_cell.xpath('string(.)')
            # print(inter)
            # 3. 'SNP RS ID'
            snp = each.xpath('td[3]/a/text()')
            if snp:
                snp_rs_id = snp[0]
            else:
                snp_rs_id = 'Not Available '
            # print(snp_rs_id)
            # 4. 'Allele name'
            Allele = each.xpath('td[4]/text()')
            if Allele:
                Allele_name = Allele[0]
            else:
                Allele_name = 'Not Available '
            # print(Allele_name)
            # 5. 'Defining change'
            Defining = each.xpath('td[5]/text()')
            if Defining:
                Defining_change = Defining[0]
            else:
                Defining_change = 'Not Available '
            # print(Defining_change)
            # 6. 'Adverse Reaction'
            Adverse = each.xpath('td[6]/text()')
            if Adverse:
                Adverse_Reaction = Adverse[0]
            else:
                Adverse_Reaction = 'Not Available '
            # print(Adverse_Reaction)
            # 7. 'Reference(s)'
            ref = each.xpath('td[7]/span/a/text()')[0]
            href = each.xpath('td[7]/span/a/@href')[0]
            original_title = each.xpath('td[7]/span/a/@data-original-title')[0]
            # print(ref)
            # print(href)
            # print(original_title)

            tt = (drug, inter, snp_rs_id, Allele_name, Defining_change,
                  Adverse_Reaction, ref, href, original_title)
            ll.append(tt)  # ll is the module-level result list filled in __main__

        # print(ll)


    if __name__ == '__main__':
        ll = []
        for i in range(1, 5):
            page_num = i
            site = 'http://www.drugbank.ca/genobrowse/snp-adr?page=' + str(page_num)
            # get the html through webdriver, then parse the saved local copy
            file_name = unicode(today_string) + u'drugbank_' + unicode(str(page_num)) + u'.html'
            html_getter(site, file_name)
            html = file_html(file_name)
            data_get(html)
        # write everything out once, after all pages have been scraped
        csv_writer(ll)
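    Running the script opens Firefox, saves each of the four listing pages to a local .html file named with today's date and the page number, and appends the parsed table rows (with a header row) to drugbank.csv in the working directory.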

  • Original post: https://www.cnblogs.com/miranda-tang/p/5508359.html