zoukankan      html  css  js  c++  java
  • 阳光高考的问题

    import requests
    import time
    from lxml import etree

    def get_html(url, max_retries=5, timeout=10):
        """Fetch a page and return its decoded text.

        Args:
            url: Address of the page to request.
            max_retries: How many additional attempts to make when the server
                answers with a status other than 200. The retry count is
                bounded so a permanently failing URL cannot recurse or loop
                forever.
            timeout: Seconds to wait for the server before giving up; without
                it ``requests.get`` may block indefinitely.

        Returns:
            The response body as text, or ``None`` when the request raised a
            ``requests`` error or every attempt returned a non-200 status.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        for _ in range(max_retries + 1):
            try:
                res = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # Best-effort fetch: report the reason and let the caller
                # decide what to do with a missing page.
                print("问题是", e)
                return None
            if res.status_code == 200:
                # Decode with the encoding detected from the body rather than
                # the one declared in the response headers.
                res.encoding = res.apparent_encoding
                return res.text
            # Short pause before asking again.
            time.sleep(0.1)
        return None

    def parse(html):
        """Extract the detail-page URLs from one listing page.

        Args:
            html: Markup of a listing page, or ``None``/empty string when the
                download failed.

        Returns:
            A list of absolute URLs, one per link found in the
            ``js-yxk-yxmc`` cells of the ``yxk-table`` block. Empty when
            ``html`` is empty or no links are present.
        """
        # A failed download yields no markup; there is nothing to parse.
        if not html:
            return []

        # Site root that the relative hrefs are appended to.
        base_url = 'https://gaokao.chsi.com.cn'

        tree = etree.HTML(html)
        hrefs = tree.xpath("//div[@class='yxk-table']//td[@class='js-yxk-yxmc']/a/@href")

        return [base_url + href for href in hrefs]
    def url_join(total_pages=138, page_size=20):
        """Build the URLs of the search-result listing pages.

        Args:
            total_pages: Number of listing pages to generate URLs for.
            page_size: Step between the ``start-`` offsets of consecutive
                pages (presumably the number of results per page).

        Returns:
            A list of ``total_pages`` URLs whose ``start-`` offsets are
            ``0, page_size, 2 * page_size, ...``.
        """
        url_start = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-'
        url_end = '.dhtml'
        return [
            url_start + str(page * page_size) + url_end
            for page in range(total_pages)
        ]
    if __name__ == '__main__':
        # Only the first listing page is fetched here. To crawl every page,
        # loop over url_join() and call get_html() and parse() for each URL.
        url = 'https://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-0.dhtml'
        html = get_html(url)
        # get_html() returns None when the download fails; skip parsing then
        # instead of handing None to the HTML parser.
        url_list = parse(html) if html else []
        print(url_list)
  • 相关阅读:
    Python---HTML常用标签
    Python---进阶---Tkinter---game
    工程师的URL大全
    docker安装小笔记
    SQL server查询语句
    非常好用的sersync同步工具
    运维自动化的标准
    使用ansible实现轻量级的批量主机管理
    emos邮件系统的web密码修改方法
    Linux 之 rsyslog+mysql+LogAnalyzer 日志收集系统
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/11113859.html
Copyright © 2011-2022 走看看