zoukankan      html  css  js  c++  java
  • 两段实际爬虫程序应用

    import requests
    from  lxml   import  html
    etree = html.etree
    from  bs4 import  BeautifulSoup
    # Fetch a WeChat article and print the first two cells of every table row.
    url = "https://mp.weixin.qq.com/s/drle9K4jgVWxm4v14ETbpQ"
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='html.parser')
    # prettify() re-serialises the document with one node per line, so the
    # text nodes read back below are padded with newlines and indentation.
    content = soup.prettify()
    html_content = etree.HTML(content)
    # XPath of the target table as copied from the browser dev tools:
    # //*[@id="js_content"]/section[2]/section/section[2]/table
    ret_data = html_content.xpath('//tr')
    for item in ret_data:
        con = item.xpath("./td[1]/text()")
        con1 = item.xpath("./td[2]/text()")
        # Rows without direct text in both cells (header rows, empty rows,
        # cells whose text is nested in child tags) would raise IndexError
        # on the [0] lookups below, so skip them.
        if not con or not con1:
            continue
        first_cell = con[0].strip("\n").strip(" ").strip("\n")
        # NOTE: the second cell only gets newline-then-space stripping, so a
        # trailing newline left after the spaces are removed is kept.
        second_cell = con1[0].strip("\n").strip(" ")
        print(first_cell + second_cell)
    

     

    import requests
    from  lxml   import  html
    etree = html.etree
    from  bs4 import  BeautifulSoup
    # Fetch a WeChat article containing a question/answer table and print one
    # line per row: "<question parts>      答案→<answer parts>".
    def _clean(text):
        """Strip the newline / space / newline padding around a text node."""
        return text.strip("\n").strip(" ").strip("\n")

    url = "https://mp.weixin.qq.com/s/Zt2K7aOfSr8mrSdArfzWAg"
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='html.parser')
    # prettify() re-serialises the document with one node per line, so the
    # text nodes read back below are padded with newlines and indentation.
    content = soup.prettify()
    html_content = etree.HTML(content)
    # XPaths of the target table / a sample cell as copied from the browser:
    # //*[@id="js_content"]/section[2]/section/section[2]/table
    # //*[@id="js_content"]/section[2]/section/section[2]/section[2]/section/section/table/tbody/tr[6]/td[1]/p/span[1]
    ret_data = html_content.xpath('//tr')
    for item in ret_data:
        con = item.xpath("./td[1]/p/span[1]/text()")
        con1 = item.xpath("./td[1]/p/span[2]/text()")
        con2 = item.xpath("./td[2]/p/text()")
        con3 = item.xpath("./td[2]/p/span/text()")
        # Rows with no text in the first span have nothing to print and would
        # raise IndexError on con[0]; skip them.
        if not con:
            continue
        question = _clean(con[0])
        if con1:
            # The answer is split between the <p> text and a nested <span>;
            # either part may be absent, so fall back to an empty string
            # instead of failing on the [0] lookup.
            answer_text = _clean(con2[0]) if con2 else ""
            answer_span = _clean(con3[0]) if con3 else ""
            print(question + _clean(con1[0]) + "      答案→" + answer_text + answer_span)
        else:
            print(question)

     

    下载 CSV 文件，并获取其内容

    import csv
    import requests
    
    # Download a CSV file over HTTP, parse it, and print every row.
    import io

    CSV_URL = 'https://www.remedy-cloud.com/download/csv/CVE-2020-1938'

    with requests.Session() as s:
        # A timeout keeps the script from hanging forever on a stalled
        # connection, and raise_for_status() stops an HTTP error page from
        # being parsed as if it were CSV data.
        download = s.get(CSV_URL, timeout=10)
        download.raise_for_status()
        decoded_content = download.content.decode('utf-8')
        # Feed the reader a file-like object opened with newline='' rather
        # than str.splitlines(): splitlines() breaks quoted fields that
        # contain line breaks and also splits on characters such as
        # \x0b, \x0c and \u2028 that are not CSV record separators.
        cr = csv.reader(io.StringIO(decoded_content, newline=''), delimiter=',')
        my_list = list(cr)
        for row in my_list:
            print(row)
    

      

  • 相关阅读:
    Python模糊查询本地文件夹去除文件后缀(7行代码)
    Python正则表达式
    python的logging模块
    Python中hashlib模块
    Python的os模块
    项目初始化mysql建库和授权
    Add correct host key in /root/.ssh/known_hosts to get rid of this message
    高中典型的等比数学题
    autoenv的使用方法
    celery任务进程关闭
  • 原文地址:https://www.cnblogs.com/weidaijie/p/14118768.html
Copyright © 2011-2022 走看看