zoukankan      html  css  js  c++  java
  • 爬呀,列表有最大长度的哦

    # import requests
    # from lxml import etree
    # import time,random
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # response=requests.get('http://xh.5156edu.com/kxbs.html',headers=header).content.decode('gbk')
    # print(response)
    # html = etree.HTML(response)
    # print(html.xpath('//table[@id="table1"]//a/text()'))
    # print(len(html.xpath('//table[@id="table1"]//a/text()')))
    # print(html.xpath('//table[@id="table1"]//a/@href'))
    # print(len(html.xpath('//table[@id="table1"]//a/@href')))
    # urllist=['http://xh.5156edu.com'+i for i in html.xpath('//table[@id="table1"]//a/@href')]
    # print(urllist)
    # print(len(urllist))
    # print(etree.tostring(html))
    # allli=[]
    # s=0
    # for i in urllist:
    # time.sleep(random.uniform(3,4))
    # response = requests.get(i, headers=header).content.decode('gbk')
    # html = etree.HTML(response)
    # nowlist = ['http://xh.5156edu.com' + i for i in html.xpath('//td[ @ width = "8%"]/a/@href')]
    # print(len(nowlist),nowlist)
    # s+=len(nowlist)
    # print(s)
    # allli+=nowlist
    # for i in nowlist:
    # with open('kxzd_urllist',mode='a',encoding='utf-8') as f:
    # f.write(i+'\n')
    # print(len(allli),allli)


    # import requests
    # import time,random,os
    # import re
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # su=0
    # de=0
    # with open('kxzd_urllist') as f:
    # for i in f:
    # s=i.strip()
    # time.sleep(random.uniform(1, 2))
    # try:
    # response=requests.get(s,headers=header).content.decode('gbk')
    # name = os.path.join(r'D:\urllist', re.findall('<title>(.*?)</title>',response)[0] + '.html')
    # with open(name,mode='a',encoding='gbk') as f:
    # f.write(response)
    # with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # su+=1
    # except:
    # with open('kxzd_urllist_uncomplited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # de+=1
    # print(su,de)



    # import requests
    # import time,random,os
    # import re
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # su=0
    # de=0
    # with open('kxzd_urllist_uncomplited') as f:
    # for i in f:
    # s=i.strip()
    # time.sleep(random.uniform(1, 2))
    # try:
    # response=requests.get(s,headers=header).content.decode('GB18030')
    # name = os.path.join(r'D:\urllist_utf-8', re.findall('<title>(.*?)</title>',response)[0] + '.html')
    # with open(name, mode='w', encoding='utf-8') as f:
    # f.write(response.replace('charset=gb2312', 'charset=utf-8'))
    # with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # su+=1
    # except:
    # with open('kxzd_urllist_stilluncomplited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # de+=1
    # print(su,de)


    # import requests
    # import time,random,os
    # import re
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # su=0
    # de=0
    # with open('kxzd_urllist_stilluncomplited') as f:
    # for i in f:
    # s=i.strip()
    # time.sleep(random.uniform(1, 2))
    # try:
    # response=requests.get(s,headers=header).content.decode('GB18030',errors='ignore')
    # name = os.path.join(r'D:\urllist_decode_error', re.findall('<title>(.*?)</title>',response)[0] + '.html')
    # with open(name, mode='w', encoding='utf-8') as f:
    # f.write(response.replace('charset=gb2312', 'charset=utf-8'))
    # with open('kxzd_urllist_complited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # su+=1
    # except:
    # with open('kxzd_urllist_stillstilluncomplited', mode='a', encoding='gbk') as f:
    # f.write(s+'\n')
    # de+=1
    # print(su,de)



    # from lxml import etree
    # import time,random,os
    # import re,requests
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # response=requests.get('http://xh.5156edu.com/kx/z90m98j7521.html').content.decode('gbk')
    # print(response)
    # html = etree.HTML(response)
    # print(html)
    # s=html.xpath('//div/table/tbody/tr/td[@class="font_14"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
    # # print(s)
    # m=etree.tostring(s[0],encoding='gbk',method="html")
    # print(m.decode('gbk'))



    # from lxml import etree
    # import time,random,os
    # import re,requests
    # li=os.listdir(r'C:\Users\lenovo\Desktop\urllist_gbk')
    # os.chdir(r'C:\Users\lenovo\Desktop\urllist_gbk')
    # for i in li:
    # with open(i,mode='r',encoding='gbk') as f:
    # response=f.read()
    # print(response)
    # try:
    # html = etree.HTML(response)
    # print(html)
    # s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
    # if not s:
    # print(s)
    # m=etree.tostring(s[0],encoding='gbk',method="html")
    # print('*'*20)
    # print(m.decode('gbk'))
    # s=m.decode('gbk')
    # name = os.path.join(r'C:\Users\lenovo\Desktop\fingbk',i)
    # with open(name, mode='w+', encoding='gbk') as f:
    # f.write(s)
    # except:
    # with open('xpath无效文件',mode='a')as f1 :
    # f1.write(i+'\n')
    # os.remove(i)


    # from lxml import etree
    # import time,random,os
    # import re,requests
    # li=os.listdir(r'C:\Users\lenovo\Desktop\urllist_utf-8')
    # os.chdir(r'C:\Users\lenovo\Desktop\urllist_utf-8')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # print(response)
    # try:
    # html = etree.HTML(response)
    # print(html)
    # s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
    # if not s:
    # print(s)
    # m=etree.tostring(s[0],encoding='utf-8',method="html")
    # print('*'*20)
    # print(m.decode('utf-8'))
    # s=m.decode('utf-8')
    # name = os.path.join(r'C:\Users\lenovo\Desktop\finutf8',i)
    # with open(name, mode='w+', encoding='utf-8') as f:
    # f.write(s)
    # except:
    # with open('xpath无效文件',mode='a')as f1 :
    # f1.write(i+'\n')
    # os.remove(i)


    # from lxml import etree
    # import time,random,os
    # import re,requests
    # li=os.listdir(r'C:\Users\lenovo\Desktop\urllist_decode_error')
    # os.chdir(r'C:\Users\lenovo\Desktop\urllist_decode_error')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8',errors='ignore') as f:
    # response=f.read()
    # print(response)
    # try:
    # html = etree.HTML(response)
    # print(html)
    # s=html.xpath('//div[@align="center"]|//div[@align="center"]/table/tr/td[@class="font_23"]')
    # if not s:
    # print(s)
    # m=etree.tostring(s[0],encoding='gbk',method="html")
    # print('*'*20)
    # print(m.decode('gbk'))
    # s=m.decode('gbk')
    # name = os.path.join(r'C:\Users\lenovo\Desktop\finerror',i)
    # with open(name, mode='w+', encoding='gbk') as f:
    # f.write(s)
    # except:
    # with open('xpath无效文件',mode='a')as f1 :
    # f1.write(i+'\n')
    # os.remove(i)


    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\finutf81')
    # os.chdir(r'C:\Users\lenovo\Desktop\finutf81')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # if '<br><b><span class="font_15">基本解释</span></b>' in response:
    # response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)

    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\finerror1')
    # os.chdir(r'C:\Users\lenovo\Desktop\finerror1')
    # for i in li:
    # with open(i,mode='r+',encoding='gbk') as f:
    # response=f.read()
    # if '<br><b><span class="font_15">基本解释</span></b>' in response:
    # response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)

    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # os.chdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # for i in li:
    # with open(i,mode='r+',encoding='gbk') as f:
    # response=f.read()
    # if '<br><b><span class="font_15">基本解释</span></b>' in response:
    # response=response.split('<br><b><span class="font_15">基本解释</span></b>')[0]
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)





    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\finutf81')
    # os.chdir(r'C:\Users\lenovo\Desktop\finutf81')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # if '/a>' in response:
    # response=response.replace('<a','<span').replace('</a>','</span>')
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)
    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # os.chdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # for i in li:
    # with open(i,mode='r+',encoding='utf-8') as f:
    # response=f.read()
    # if '/a>' in response:
    # response=response.replace('<a','<span').replace('</a>','</span>')
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)




    # import os
    # import re
    # li=os.listdir(r'C:\Users\lenovo\Desktop\finutf81')
    # os.chdir(r'C:\Users\lenovo\Desktop\finutf81')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # with open('urlist_gif',mode='a',encoding='utf-8') as f:
    # li=re.findall('src="(.*?)">',response,re.S)
    # print(li)
    # for i in li:
    # f.write('http://xh.5156edu.com'+i+'\n')

    # import os
    # import re
    # li=os.listdir(r'C:\Users\lenovo\Desktop\finerror1')
    # os.chdir(r'C:\Users\lenovo\Desktop\finerror1')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # with open('urlist_gif',mode='a',encoding='utf-8') as f:
    # li=re.findall('src="(.*?)">',response,re.S)
    # print(li)
    # for i in li:
    # f.write('http://xh.5156edu.com'+i+'\n')
    # import os
    # import re
    # li=os.listdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # os.chdir(r'C:\Users\lenovo\Desktop\fingbk1')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # with open('urlist_gif',mode='a',encoding='utf-8') as f:
    # li=re.findall('src="(.*?)">',response,re.S)
    # print(li)
    # for i in li:
    # f.write('http://xh.5156edu.com'+i+'\n')
    # li=[]
    # with open(r'C:\Users\lenovo\Desktop\finutf81\urlist_gif', mode='r',encoding='utf-8') as f:
    # for i in f:
    # s=f.readline().strip(' ')
    # # print(s)
    # li.append(s.strip())
    # with open(r'C:\Users\lenovo\Desktop\finutf81\urlist_gif', mode='w',encoding='utf-8') as f:
    # for i in set(li):
    # f.write('http://xh.5156edu.com' + i + '\n')
    #
    # li=[]
    # with open(r'C:\Users\lenovo\Desktop\finerror1\urlist_gif', mode='r',encoding='utf-8') as f:
    # for i in f:
    # s=f.readline().strip()
    # # print(s)
    # li.append(s.strip())
    # with open(r'C:\Users\lenovo\Desktop\finerror1\urlist_gif', mode='w',encoding='utf-8') as f:
    # for i in set(li):
    # f.write( i + '\n')

    # li=[]
    # with open(r'C:\Users\lenovo\Desktop\fingbk1\fingbk1\urlist_gif', mode='r',encoding='utf-8') as f:
    # for i in f:
    # s=f.readline().strip()
    # # print(s)
    # li.append(s.strip())
    # with open(r'C:\Users\lenovo\Desktop\fingbk1\fingbk1\urlist_gif', mode='w',encoding='utf-8') as f:
    # for i in set(li):
    # f.write( i + '\n')




    # li=[]
    # with open(r'C:\Users\lenovo\Desktop\finutf81\urlist_gif', mode='r',encoding='utf-8') as f:
    # for i in f:
    # s=f.readline().strip(' ')
    # # print(s)
    # li.append(s.strip())
    # with open(r'C:\Users\lenovo\Desktop\finutf81\urlist_gif', mode='w',encoding='utf-8') as f:
    # for i in set(li):
    # f.write( i + '\n')


    # import os
    # import re
    # li=os.listdir(r'C:\Users\lenovo\Desktop\allkxzd')
    # os.chdir(r'C:\Users\lenovo\Desktop\allkxzd')
    # for i in li:
    # with open(i,mode='r',encoding='utf-8') as f:
    # response=f.read()
    # with open('urlist_gif',mode='a',encoding='utf-8') as f:
    # li=re.findall('src="(.*?)">',response,re.S)
    # print(li)
    # for i in li:
    # f.write('http://xh.5156edu.com'+i+'\n')

    # li=[]
    # with open(r'C:\Users\lenovo\Desktop\allkxzd\urlist_gif', mode='r',encoding='utf-8') as f:
    # for i in f:
    # s=f.readline().strip(' ')
    # # print(s)
    # li.append(s.strip())
    # with open(r'C:\Users\lenovo\Desktop\allkxzd\new_urlist_gif', mode='w',encoding='utf-8') as f:
    # for i in set(li):
    # f.write( i + '\n')
    #
    # import os
    # li=os.listdir(r'C:\Users\lenovo\Desktop\allkxzd')
    # os.chdir(r'C:\Users\lenovo\Desktop\allkxzd')
    # for i in li:
    # with open(i,mode='r+',encoding='utf-8') as f:
    # response=f.read()
    # if 'src="/kx_images%5' in response:
    # response=response.replace('src="/kx_images%5','src="')
    # with open(i,mode='w',encoding='utf-8') as f:
    # f.write(response)

    # import requests,os
    # import time,random
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # with open(r'C:\Users\lenovo\Desktop\urlist_gif',mode='r')as f:
    # for i in f:
    # time.sleep(random.randint(2,3))
    # url=i.strip()
    # response=requests.get(url,headers=header).content
    # name=os.path.join(r'C:\Users\lenovo\Desktop\allkxzd',url.split('http://xh.5156edu.com/kx_images%5C')[1])
    # with open(name,mode='wb') as f:
    # f.write(response)



    # import requests,os
    # import time,random
    # header={
    # 'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    # }
    # li=os.listdir(r'C:\Users\lenovo\Desktop\download')
    # lifin=['http://xh.5156edu.com/kx_images%5C'+i for i in li]
    # print(len(lifin))
    # with open(r'C:\Users\lenovo\Desktop\urlist_gif',mode='r')as f:
    # for i in f:
    # url=i.strip()
    # if url not in lifin:
    # time.sleep(random.randint(2, 3))
    # response=requests.get(url,headers=header).content
    # name=os.path.join(r'C:\Users\lenovo\Desktop\allkxzd',url.split('http://xh.5156edu.com/kx_images%5C')[1])
    # with open(name,mode='wb') as f:
    # f.write(response)
    # else:
    # print('done')
  • 相关阅读:
    .Net连接字符串设置连接池大小显著提高数据库速度
    转载:MongoDB之旅(超赞,适合初学者)
    MongoDB安装成为Windows服务及日常使用遇到问题总结
    开启Windows文件共享必须开启的两个服务
    Cocos2d-JS中瓦片地图API
    EF-CodeFirst 继承关系TPH、TPT、TPC
    MVC5-4 ViewResult
    MVC5-3 Result分析
    MVC5-2 MVC的管道流与路由
    MVC5-1 ASP.NET的管道流
  • 原文地址:https://www.cnblogs.com/diracy/p/14209772.html
Copyright © 2011-2022 走看看