zoukankan      html  css  js  c++  java
  • 爬取genome的网页和图片

    # -*- coding: utf-8 -*-
    # @Time    : 2018/03/08 10:32
    # @Author  : cxa
    # @File    : gethtmlandimg.py
    # @Software: PyCharm
    
    import requests
    from fake_useragent import UserAgent as UA
    from lxml import html
    import traceback
    import os
    
    # Pathway page to download.
    # NOTE(review): the numeric path segment looks like a temporary/session id --
    # confirm the URL is still valid before running.
    url = "http://www.genome.jp/kegg-bin/show_pathway?1520394169137283/hsa01100.args"
    # Output files live in the current working directory and are named after the
    # last URL segment, with "args" swapped for the target extension.
    html_path = os.path.join(os.getcwd(), url.split("/")[-1].replace("args", "html"))
    img_path = os.path.join(os.getcwd(), url.split("/")[-1].replace("args", "png"))
    # Request headers used for the HTTP requests made by this script.
    # The key must be spelled 'Accept-Encoding': the previous 'Accept - Encoding'
    # (with spaces) is not a valid HTTP field name and was not recognised as the
    # content-coding header.
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'User-Agent': UA().random}
    # XPath selecting the `src` attribute of the pathway image element.
    # The name is misspelled ("xapth") but is kept as-is because the rest of the
    # script refers to it by this name.
    img_xapth = "//div[@class='map']/div[@class='image']/img[@name='pathwayimage']/@src"
    # Prefix prepended to the image `src` value to build the image URL.
    main_url = "http://www.genome.jp"
    
    
    def get_img(buff):
        """Save the downloaded image to the module-level ``img_path``.

        The file is opened in binary write mode, so an existing file at that
        path is overwritten.

        :param buff: bytes-like image content to write.
        """
        with open(img_path, "wb") as image_file:
            image_file.write(buff)
    
    
    # Download the pathway page, save a copy whose image link points at a local
    # PNG, then download that PNG next to it.
    try:
        # The page request is issued inside the try block so that connection
        # errors and timeouts are reported by the same handler as later failures.
        req = requests.get(url, timeout=20, headers=headers)
        # Raises requests.HTTPError for 4xx/5xx responses; other non-200
        # statuses fall through and are skipped by the check below.
        req.raise_for_status()
        if req.status_code == requests.codes.ok:
            get_html = req.text
            root = html.fromstring(get_html)
            # Evaluate the XPath once. Indexing [0] raises IndexError when the
            # page has no matching image; the handler below reports it.
            img_src = root.xpath(img_xapth)[0]
            imgurl = main_url + img_src
            # Write the page back with the encoding used to decode it, so the
            # result does not depend on the platform's default file encoding.
            page_encoding = req.encoding or "utf-8"
            with open(html_path, "w", encoding=page_encoding) as fs:
                fs.write(get_html.replace(img_src, "./{}".format(url.split("/")[-1].replace("args", "png"))))

            # Same timeout as the page request, so a stalled image download
            # cannot hang the script indefinitely.
            img_req = requests.get(imgurl, timeout=20, headers=headers)
            img_req.raise_for_status()
            if img_req.status_code == requests.codes.ok:
                buff = img_req.content
                get_img(buff)
    except Exception:
        # Catch Exception rather than using a bare except, so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still terminate the script.
        print(traceback.format_exc())
    

      

  • 相关阅读:
    「USACO 2020 US Open Platinum」Exercise
    Equilateral Triangles
    [USACO 2020 February Platinum]Help Yourself
    「ICPC World Finals 2019」美丽的桥梁
    「ICPC World Finals 2019」何以伊名始
    COCI2016/2017 Contest#6 F
    COCI2016/2017 Contest#3 F Meksikanac
    TopCoder SRM 570 Div1 CurvyonRails
    COCI2016-2017 Contest#2 F
    UOJ Round Good Bye JiHai D. 新年的追逐战
  • 原文地址:https://www.cnblogs.com/c-x-a/p/8526679.html
Copyright © 2011-2022 走看看