from bs4 import BeautifulSoup
import requests
url = "http://share.zte.com.cn/tech/jsp/blogList?uid=10021031"
baseUrl="http://share.zte.com.cn"
abUrl="http://share.zte.com.cn/tech/jsp/"
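# Use a desktop-browser User-Agent; presumably the site serves different (or no)
# content to the default python-requests UA.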
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
}
# GET the page to obtain its HTML content
f = requests.get(url, headers=headers)
html = f.text
# Save the raw page for inspection
with open("1234.html", 'wb') as file:
    file.write(html.encode())
# Parse the page content with the lxml parser; f.text returns the same HTML, decoded
soup = BeautifulSoup(f.content, "lxml")
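# Optional sanity check (a sketch; assumes the server answers HTTP 200 on success):
# uncomment to fail fast instead of parsing an error page.
# f.raise_for_status()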
# Note: when selecting by CSS class, use class_ (e.g. soup.find_all('div', class_="p12"))
# because class conflicts with the Python keyword.
# Template for the new page that will hold the table of links
NewHtml = '''<!DOCTYPE html>
<html>
<head>
</head>
<body>
<table>
</table>
</body>
</html>'''
newSoup = BeautifulSoup(NewHtml, "lxml")
column = 3
table = newSoup.find('table')
thead = newSoup.new_tag('thead')
table.append(thead)
headRow = newSoup.new_tag('tr')  # header cells must sit inside a tr to be valid HTML
thead.append(headRow)
for i in range(column):
    tdt = newSoup.new_tag('td', width="33%")
    tdt.string = "column" + str(i)
    headRow.append(tdt)
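# The generated header should now look roughly like (sketch):
# <thead><tr><td width="33%">column0</td> ... <td width="33%">column2</td></tr></thead>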
# Find the dl that holds the post abstracts
divContent = soup.find('dl', class_='abstract_view')
count = 0
tr = None
for kk in divContent.find_all('dd'):
    # Start a new row every `column` cells
    if count % column == 0 or tr is None:
        if tr is not None:
            table.append(tr)
        tr = newSoup.new_tag('tr')
    count = count + 1
    # Rewrite the relative href to an absolute one
    hhref = kk.find('a')['href']
    newHref = abUrl + hhref
    kk.find('a')['href'] = newHref
    a = newSoup.new_tag('a', href=newHref)
    a.string = kk.find('a').string
    td = newSoup.new_tag('td')
    td.append(a)
    tr.append(td)
# The trailing, possibly partial row is appended inside the pagination loop below;
# there should be a better way to implement this.
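# A cleaner alternative (sketch, not wired in): collect all cells first, then chunk
# them into rows of `column` instead of tracking count/tr by hand, e.g.:
# def rows_of(cells, n):
#     for i in range(0, len(cells), n):
#         yield cells[i:i + n]
# rows_of is a hypothetical helper, not part of the script above.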
divPageFoot = soup.find('div', class_='W_pages')
# Find the number of the last page; quick and dirty, the page URLs are stitched
# together from the first pager link with the page number swapped in
firstPage = divPageFoot.find_all('a')[0]
hreff = firstPage["href"]
index = hreff.rfind("=")
hreff = hreff[0:index + 1]
baseUrl = baseUrl + hreff
print(hreff)
lastPage = divPageFoot.find_all('a')[-1]
Pagenum = lastPage.string
print(Pagenum)
pageNumInt = int(Pagenum.strip("."))  # strip any dots around the link text
print(pageNumInt)
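# Sketch of a more robust alternative to the abUrl + hhref concatenation (urljoin
# resolves relative hrefs against the page URL the link came from):
# from urllib.parse import urljoin
# newHref = urljoin(url, hhref)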
for k in range(2, pageNumInt + 1):
    print(k)
    url = baseUrl + str(k)
    print(url)
    ff = requests.get(url, headers=headers)
    soupTemp = BeautifulSoup(ff.content, "lxml")
    divContentTemp = soupTemp.find('dl', class_='abstract_view')
    ddlist = divContentTemp.find_all('dd')
    for kk in ddlist:
        # Make the href absolute, then move the dd into the first page's dl
        hhref = kk.find('a')['href']
        newHref = abUrl + hhref
        kk.find('a')['href'] = newHref
        divContent.append(kk)
        if count % column == 0 or tr is None:
            if tr is not None:
                table.append(tr)
            tr = newSoup.new_tag('tr')
        count = count + 1
        a = newSoup.new_tag('a', href=newHref)
        a.string = kk.find('a').string
        td = newSoup.new_tag('td')
        td.append(a)
        tr.append(td)
    # Append the current (possibly partial) row; re-appending it on the next
    # pass just moves it, so this is safe, but there should be a better way.
    if tr is not None:
        table.append(tr)
    # Rewrite the output after each page so partial results are kept
    with open("345.html", 'wb') as file:
        file.write(newSoup.decode().encode())
    # Early exit for testing:
    # if k == 2:
    #     break
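    # Optional politeness delay between pages (sketch; assumes no documented rate limit):
    # import time
    # time.sleep(0.5)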
# Also save the modified original page, now with absolute links
with open("123.html", 'wb') as file:
    file.write(soup.decode().encode())
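# For a human-readable dump, bs4's prettify() could be used instead (sketch):
# with open("345_pretty.html", 'w', encoding='utf-8') as fp:
#     fp.write(newSoup.prettify())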