zoukankan      html  css  js  c++  java
  • http://www.etymon.cn/yingyucigen/3093.html

    import requests
    import lxml.etree as etree
    import xml.etree.ElementTree as ET
    
    # 详情页
    # 3093-148
    # http://www.etymon.cn/yingyucigen/148.html
    def getTgePageDetails():
        for i in range(148, 153):
            url = "http://www.etymon.cn/yingyucigen/" + str(i) + ".html"
            data = {
                'Cookie': "__51cke__=; __tins__16789340=%7B%22sid%22%3A%201578926662444%2C%20%22vd%22%3A%2040%2C%20%22expires%22%3A%201578929234076%7D; __51laig__=40",
                'Referer': "http://www.etymon.cn/yingyucigen/list_1_37.html",
                'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"}
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36"
            }
    
            res = requests.get(url, data=data, headers=headers)
            # print(res.content.decode("utf-8"))
            # 保存到文件中
            # with open("./词根.html", 'w', encoding="utf-8") as fp:
            #     fp.write(res.content.decode("utf-8"))
    
            # 解析
            html = etree.HTML(res.text)
    
            # 词根标题
            cigen_list = html.xpath('//*[@id="dictionary"]/dl/dt/h1')[0] #'//'表示获取当前节点子孙节点,'*'表示所有节点,'//*'表示获取当前节点下所有节点
            h1 = cigen_list.xpath('string(.)').strip()
            print(cigen_list)
            print(h1)
            # 保存
            with open("./cigen.html", 'a+', encoding="utf-8") as fp:
                print(i)
                fp.write(str(i))
    
    
            # # 正文
            # cigen_list = html.xpath('//*[@id="dictionary"]/dl/dd')[0]#'//'表示获取当前节点子孙节点,'*'表示所有节点,'//*'表示获取当前节点下所有节点
            # h1 = cigen_list.xpath('string(.)').strip()
            # print(cigen_list)
            # print(h1)
            # # 保存
            # with open("./cigen.html", 'a+', encoding="utf-8") as fp:
            #     print(i)
            #     fp.write(str(i))
            # #
            #
            #
            # # 保存
            # with open("./词根_处理完毕的.html", 'w', encoding="utf-8") as fp:
            #     fp.write(h1)
    
    if __name__ == '__main__':
        getTgePageDetails()
    

      

  • 相关阅读:
    idea jsp无法加载<c:foreach>循环遍历数据
    java POI读取Excel文件
    Javaweb中请求的资源[/Servlet]不可用解决方案
    大作业第一阶段冲刺(一)
    hive中sql左外连接查询列值为null
    关于ECharts在jsp页面无法显示的问题
    echarts通过ajax实现数据加载
    读书笔记
    解决:[Err] 1064
    idea启动Tomcat报错Artifact testdemo1:war exploded: Error during artifact deployment. See server log for details.问题解决
  • 原文地址:https://www.cnblogs.com/hellangels333/p/12190075.html
Copyright © 2011-2022 走看看