  • Web scraper

    from bs4 import BeautifulSoup   # lxml must also be installed for the "lxml" parser used below
    import requests
    
    url = "http://share.zte.com.cn/tech/jsp/blogList?uid=10021031"
    baseUrl="http://share.zte.com.cn"
    abUrl="http://share.zte.com.cn/tech/jsp/"
    
    # Pretend to be a regular browser; requests' default User-Agent is often blocked.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
    }
    f = requests.get(url,headers=headers)
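    # A more defensive fetch (sketch, not in the original script): a timeout and a
    # status check make hung or failed requests fail loudly instead of silently.
    # f = requests.get(url, headers=headers, timeout=10)
    # f.raise_for_status()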
    
    html = f.text                            # f.text is the fetched page's HTML, decoded to str
    
    # Save the raw page for inspection.
    with open("1234.html", 'w', encoding='utf-8') as file:
        file.write(html)
    
    # Parse the page with the lxml parser (f.content is the same HTML as raw bytes;
    # BeautifulSoup works out the encoding itself).
    soup = BeautifulSoup(f.content, "lxml")
    #print(f.content.decode())                      # print the page to check the fetch succeeded
    #content = soup.find_all('div', class_="p12")   # "class" clashes with the Python keyword, hence class_
    #print(soup)
    
    # Skeleton for the output page; the scraped links go into the <table>.
    # Note: a multi-line string literal needs triple quotes.
    NewHtml = '''<!DOCTYPE html>
    <html>
    <head>
    </head>
    <body>
    <table>
    </table>
    </body>
    </html>'''
    newSoup = BeautifulSoup(NewHtml, "lxml")
    column = 3
    table = newSoup.find('table')
    thead = newSoup.new_tag('thead')
    table.append(thead)
    # Header row: one cell per column, wrapped in a <tr> to keep the markup valid.
    trh = newSoup.new_tag('tr')
    thead.append(trh)
    for i in range(column):
        tdt = newSoup.new_tag('td', width="33%")
        tdt.string = "column" + str(i)
        trh.append(tdt)
        
    
    
    # Locate the content block: a <dl class="abstract_view"> holding the post list.
    divContent = soup.find('dl', class_='abstract_view')
    count = 0
    tr = None
    
    for kk in divContent.find_all('dd'):
        # Start a new table row every `column` entries.
        if count % column == 0 or tr is None:
            if tr is not None:
                table.append(tr)
            tr = newSoup.new_tag('tr')
        count = count + 1
    
        # Make the link absolute, then copy it into a cell of the new table.
        hhref = kk.find('a')['href']
        newHref = abUrl + hhref
        kk.find('a')['href'] = newHref
        a = newSoup.new_tag('a', href=newHref)
        a.string = kk.find('a').string
        td = newSoup.new_tag('td')
        td.append(a)
        tr.append(td)
    
    # The trailing (possibly partial) row is appended after the paging loop below;
    # there should be a cleaner way to do this (see the sketch at the end of the script).
         
    
    
    
    divPageFoot = soup.find('div', class_='W_pages')
    #print(divPageFoot)
    # Find the number of the last page. Quick and dirty: the paging URL is pieced
    # together from the first pager link, whose href ends in "=<page number>".
    firstPage = divPageFoot.find_all('a')[0]
    hreff = firstPage["href"]
    index = hreff.rfind("=")
    hreff = hreff[0:index + 1]
    baseUrl = baseUrl + hreff
    print(hreff)
    
    lastPage = divPageFoot.find_all('a')[-1]
    Pagenum = lastPage.string
    print(Pagenum)
    # The last pager link's text can carry leading dots (hence the strip("..")).
    pageNumInt = int(Pagenum.strip(".."))
    print(pageNumInt)
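    # Hypothetical illustration of the URL assembly above (parameter name invented):
    # if the first pager href were "blogList?uid=10021031&curPage=1", hreff would
    # become "blogList?uid=10021031&curPage=", and page k is fetched from baseUrl + str(k).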
    
    
    # Fetch the remaining pages and append their entries to the same table.
    for k in range(2, pageNumInt + 1):
        print(k)
        url = baseUrl + str(k)
        print(url)
        ff = requests.get(url, headers=headers)
        soupTemp = BeautifulSoup(ff.content, "lxml")
        divContentTemp = soupTemp.find('dl', class_='abstract_view')
        ddlist = divContentTemp.find_all('dd')
        for kk in ddlist:
            # Make the link absolute and collect the entry under the first page's <dl>.
            hhref = kk.find('a')['href']
            newHref = abUrl + hhref
            kk.find('a')['href'] = newHref
            divContent.append(kk)
            # Same row-building logic as on the first page.
            if count % column == 0 or tr is None:
                if tr is not None:
                    table.append(tr)
                tr = newSoup.new_tag('tr')
            count = count + 1
    
            a = newSoup.new_tag('a', href=newHref)
            a.string = kk.find('a').string
            td = newSoup.new_tag('td')
            td.append(a)
            tr.append(td)
    
    # Append the trailing (possibly partial) row.
    if tr is not None:
        table.append(tr)
    
    
    file=open("345.html",'wb')
    file.write(str.encode(newSoup.decode()))
    file.close()
         
    
    
    # Also save the first page's soup (links rewritten, all entries appended) for reference.
    with open("123.html", 'w', encoding='utf-8') as file:
        file.write(soup.decode())
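    
    # --- Alternative row-building sketch (not part of the original flow) ---
    # The loops above track `count` and a dangling `tr` by hand. A cleaner variant,
    # assuming the same newSoup/table/column objects, collects (text, href) pairs
    # first and then emits complete rows in chunks of `column`:
    def build_rows(links, table, soup, column=3):
        """links: iterable of (text, href) pairs; appends one <tr> per `column` links."""
        links = list(links)
        for start in range(0, len(links), column):
            tr = soup.new_tag('tr')
            for text, href in links[start:start + column]:
                a = soup.new_tag('a', href=href)
                a.string = text
                td = soup.new_tag('td')
                td.append(a)
                tr.append(td)
            table.append(tr)
    # Usage sketch (all_links is hypothetical): build_rows(all_links, table, newSoup, column)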
    
    