zoukankan      html  css  js  c++  java
  • lxml_time_代理

      1 import requests
      2 from pyquery import PyQuery as pq
      3 import json
      4 import jsonpath
      5 from lxml import etree
      6 import os
      7 import re
      8 import time
      9 
     10 html = '''
     11 <div>
     12     <ul>
     13          <li class="item-0">first item</li>
     14          <li class="item-1"><a href="link2.html">second item</a></li>
     15          <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     16          <li class="item-1 active"><a href="link4.html">fourth item</a></li>
     17          <li class="item-0"><a href="link5.html">fifth item</a></li>
     18      </ul>
     19 </div>
     20 '''
     21 
     22 # html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
     23 num = 0
     24 # def pq方法(url):
     25 #     global num
     26 #     html=requests.get(url).content.decode('gbk')
     27 #     doc = pq(html)
     28 #     items = doc('#dq_list > li').items()
     29 #     # print(doc)
     30 #     # print(type(doc))
     31 #     for item in items:
     32 #         url=item.find('img').attr('lz_src')
     33 #         num+=1
     34 #         print(str(num),url)
     35 #         url_content=requests.get(url).content
     36 #         name = item.find('.kp-name').text()
     37         
     38 #         with open('e:/py3/002/'+'{:0>4}'.format(str(num))+name+'.jpg','wb') as file:
     39 #             file.write(url_content)
     40 #         # print(url,name)
     41 
     42 def transformCodec(re_data):#ascii (gbk) 转 unicode
     43     try:
     44         re_data = re_data.decode('gbk')
     45     except Exception as error:
     46         print (error)
     47         print ('delete illegal string,try again...')
     48         
     49         pos = re.findall(r'decodebytesinposition([d]+)-([d]+):illegal',str(error).replace(' ',''))
     50         if len(pos)==1:
     51             re_data = re_data[0:int(pos[0][0])]+re_data[int(pos[0][1]):]
     52             re_data = transformCodec(re_data)
     53             return re_data
     54     return re_data
     55 
     56 
     57 def lxml方法(url):
     58     global num
     59     header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
     60     content=requests.get(url,headers=header).content
     61     html=content.decode('utf-8')
     62     # print(html.status_code)
     63     # print(content)
     64     # print(html)    
     65     r=etree.HTML(html)
     66     # items=r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a/img/@lz_src")
     67     items=r.xpath("//div[@id='list']/table//tr")
     68     # print(items)
     69     for item in items:
     70         dl_ip=item.xpath("./td[1]/text()")
     71         dl_port=item.xpath("./td[2]/text()")
     72         dl_name=item.xpath("./td[5]/text()")
     73         num+=1
     74         dl_ip=dl_ip[0]+":" if len(dl_ip)>=1 else ''
     75         dl_port=dl_port[0]+"#" if len(dl_port)>=1 else ''
     76         dl_name=dl_name[0] if len(dl_name)>=1 else ''
     77         
     78         # print(len(dl_ip))
     79         # print(dl_ip)
     80         # print(r'{}{}{}'.format(dl_ip,dl_port,dl_name))
     81         with open("proxy.txt",'a',encoding='utf-8') as file:
     82             file.write('{}{}{}
    '.format(dl_ip,dl_port,dl_name))
     83         # lzcontent=requests.get(lzsrc).content
     84         # with open('e:/py3/004/'+'{:0>4}'.format(str(num))+'_'+kpname+'.jpg','wb')as file:
     85         #     file.write(lzcontent)
     86 
     87 
     88 
     89 
     90 
     91 if __name__ == '__main__':
     92     with open("proxy.txt", 'w', encoding='utf-8') as file:
     93             file.write(str(time.localtime()[0])+'_'+str(time.localtime()[1])+'_'+str(time.localtime()[2])+'_采集:
    ')
     94     # url='https://www.kuaidaili.com/free/inha/1/'
     95     for i in range(1,11):
     96         print(''+str(i)+'次:
    ')
     97         url2 = r'https://www.kuaidaili.com/free/inha/'+str(i)+r'/'
     98         print(url2)
     99         lxml方法(url2)
    100         time.sleep(5)
    101 
    102     # header={'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
    103     # pq方法()
    104     # print(str(time.localtime()[0])+'_'+str(time.localtime()[1])+'_'+str(time.localtime()[2]))
    105     print(str(num)+' ok!')
    106 
    107 
    108     # 创建目录
    109     '''
    110     for dirnum in range(1,100):
    111         dirnum2='{:0>3}'.format(str(dirnum))
    112         mkpath="e:\py3\{}\".format(dirnum2)
    113         print(mkpath)
    114         print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
    115     '''
  • 相关阅读:
    NX二次开发-UFUN UF_UI_add_to_class_sel将UDOTestClass类的显示名称加入到类选择对话框的类列表中
    NX二次开发-UFUN创建管道UF_MODL_create_tube
    NX二次开发-UFUN获得工作视图的tag UF_VIEW_ask_work_view
    SQLyog/Mysql怎么修改用户/root密码【转载】
    MySQL返回来的值都是字符串类型,还原每个字段【转载】
    NX二次开发-NX访问MySQL数据库(增删改查)
    NX二次开发-ug表达式函数ug_find_file读取当前prt所在路径【转发】
    QT界面开发-QProgressBar【转载】
    QT界面开发-使用new QComboBox添加触发事件
    QT界面开发-窗口滚动条【转发】
  • 原文地址:https://www.cnblogs.com/pscc/p/9866225.html
Copyright © 2011-2022 走看看