After a few days of study I have found that some pages are easy to scrape, novel sites for example, but the Beijing municipal "public letters" (百姓信件) page turned out to be less straightforward than expected: when you move to the next page of the list, the URL in the address bar does not change. After asking a classmate I learned that the list is loaded with Ajax, so I scraped it with the Scrapy framework plus JSON. The first step is to find the URL of each detail page from the letter list. The list page does not contain complete URLs; the detail addresses differ only in a trailing identifier, which sits in the onclick attribute of the a tag. Once that identifier is extracted, the detail page address can be assembled, and XPath is then used to scrape the content of each letter.
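
As a rough illustration of the idea just described (pulling the identifier out of the onclick attribute and splicing it onto a detail-page URL), here is a minimal sketch. The exact shape of the onclick value and of the detail URL are assumptions based on what the page showed in the browser's developer tools, so they have to be checked against the live page.

import re
import requests
from lxml import etree

# Fetch the first, HTML-rendered page of the letter list.
list_page = requests.get(
    'http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow',
    timeout=15)
tree = etree.HTML(list_page.text)
# Each letter link is assumed to carry the id in its onclick attribute,
# e.g. onclick="letterdetail('...','AH20010100123')"; the regex below is
# therefore also an assumption.
for onclick in tree.xpath("//a/@onclick"):
    match = re.search(r"'([A-Za-z]{2}\d+)'", onclick)
    if match:
        original_id = match.group(1)
        detail_url = ('http://www.beijing.gov.cn/hudong/hdjl/'
                      'com.web.consult.consultDetail.flow?originalId=' + original_id)
        print(detail_url)

The spider below takes a different route: instead of parsing onclick attributes page by page, it asks the Ajax endpoint directly and reads the ids from the JSON it returns.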


# -*- coding: utf-8 -*-
import json
import random
import string
import scrapy


class XinjianSpider(scrapy.Spider):
    name = 'xinjian'
    allowed_domains = ['www.beijing.gov.cn']
    # custom_settings overrides settings for this one spider, while settings.py holds
    # the global values; this is handy when one Scrapy project contains several spiders.
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'authority': 'www.beijing.gov.cn',
            # Accept tells the server which response types the client will take.
            'accept': 'application/json, text/javascript, */*; q=0.01',
            # Content encodings the client accepts.
            'accept-encoding': 'gzip, deflate',
            # Languages the client accepts.
            'accept-language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            # Content-Type tells the server how the request body is encoded
            # (it determines whether the parameters travel as form data or as a payload).
            'Content-Type': 'text/json',
            # Origin names where the request comes from (scheme and host only); it is sent
            # for all cross-origin requests, and for same-origin POSTs but not GETs.
            'origin': 'http://www.beijing.gov.cn',
            # Referer: the URL the request was issued from.
            'referer': 'http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow',
            # User-Agent imitates a real browser.
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest',
            # The cookie is also passed along as an ordinary request header.
            'cookie': 'HDJLJSID=39DBD6D5E12B9F0F8834E297FAFC973B; __jsluid_h=e6e550159f01ae9aceff30d191b09911; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216f9edc47471cb-0059c45dfa78d6-c383f64-1049088-16f9edc474895%22%7D; _gscu_564121711=80128103kc5dx617; X-LB=1.1.44.637df82f; _va_ref=%5B%22%22%2C%22%22%2C1580462724%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DM-f5ankfbAnnYIH43aTQ0bvcFij9-hVxwm64pCc6rhCu5DYwg6xEVis-OVjqGinh%26wd%3D%26eqid%3Dd6b151bf000cfb36000000025e1c5d84%22%5D; _va_ses=*; route=74cee48a71a9ef78636a55b3fa493f67; _va_id=b24752d801da28d7.1578917255.10.1580462811.1580450943.',
        }
    }
    # start_requests has to be overridden so the spider sends POST requests.
    def start_requests(self):
        # The Ajax endpoint used by the list page.
        url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.mailList.biz.ext"
        # All requests to be issued.
        requests = []
        for i in range(0, 33750, 1000):
            random_random = random.random()
            # POST body parameters for one page of 1000 results.
            my_data = {'PageCond/begin': i, 'PageCond/length': 1000, 'PageCond/isCount': 'true',
                       'keywords': '', 'orgids': '', 'startDate': '', 'endDate': '',
                       'letterType': '', 'letterStatue': ''}
            # Imitate the Ajax POST request.
            request = scrapy.http.Request(url, method='POST',
                                          callback=self.parse_model,
                                          body=json.dumps(my_data),
                                          encoding='utf-8')
            requests.append(request)
        return requests
    def parse_model(self, response):
        # The response body is JSON and can be parsed with the json library.
        jsonBody = json.loads(response.body)
        print(jsonBody)
        size = jsonBody['PageCond']['size']
        data = jsonBody['mailList']
        listdata = {}
        fb1 = open('suggest.txt', 'a')
        fb2 = open('consult.txt', 'a')
        fb3 = open('complain.txt', 'a')
        for i in range(size):
            print(i)
            listdata['letter_type'] = data[i]['letter_type']
            listdata['original_id'] = data[i]['original_id']
            # Write the id into a different file depending on the letter type.
            if listdata['letter_type'] == "咨询":
                fb2.write(listdata['original_id'])
                fb2.write('\n')
            elif listdata['letter_type'] == "建议":
                fb1.write(listdata['original_id'])
                fb1.write('\n')
            else:
                fb3.write(listdata['original_id'])
                fb3.write('\n')
            # listdata['catalog_id'] = str(data[i]['catalog_id'])
            # listdata['letter_title'] = data[i]['letter_title']
            # listdata['create_date'] = data[i]['create_date']
            # listdata['org_id'] = data[i]['org_id']
            # listdata['letter_status'] = data[i]['letter_status']
            print(listdata)
        fb1.close()
        fb2.close()
        fb3.close()
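
The code above is a complete Scrapy spider, so assuming it sits inside an ordinary Scrapy project it can be started with scrapy crawl xinjian. It pages through the Ajax endpoint 1000 letters at a time and sorts the original_id of every letter into suggest.txt, consult.txt or complain.txt according to its type. The standalone script below then reads the consultation ids (here assumed to have been copied to D:\consult.txt) and scrapes each detail page with requests and lxml.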
import random
import re
import requests
from lxml import etree

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}


def read():
    # Read the consultation ids saved by the spider and fetch each detail page.
    f = open('D://consult.txt', 'r')
    for id in f.readlines():
        id = id.strip()
        url2 = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=" + id
        parser(url2)
    f.close()


def write(contents):
    # Append one piece of a record to the output file, dropping characters gbk cannot encode.
    f2 = open('D://zx3.txt', 'a+')
    f2.write(contents.encode("gbk", 'ignore').decode("gbk", "ignore"))
    print(contents, 'written')
    f2.close()
def parser(url):
    try:
        response = requests.get(url, headers=header, timeout=15)
        html = etree.HTML(response.text)

        # The helpers below strip newlines, tabs and non-breaking/full-width spaces
        # from the text nodes returned by XPath and drop the empty strings.
        def process_num(num):
            num = [re.sub(r"\n|\r|\t|\xa0|\u3000", "", i) for i in num]
            num = [i for i in num if len(i) > 0]
            return num

        def process_content(content):
            content = [re.sub(r"\n|\r|\t|\xa0|\u3000", "", i) for i in content]
            content = [i for i in content if len(i) > 0]
            return content

        def process_jigou(jigou):
            jigou = [re.sub(r"\n|\r|\t|\u3000", "", i) for i in jigou]
            jigou = [i for i in jigou if len(i) > 0]
            return jigou

        def process_hfcontent(hfcontent):
            hfcontent = [re.sub(r"\n|\r|\t|\xa0|\u3000", "", i) for i in hfcontent]
            hfcontent = [i for i in hfcontent if len(i) > 0]
            return hfcontent

        def process_person(person):
            person = [re.sub(r"\n|\r|\t|\u3000", "", i) for i in person]
            person = [i for i in person if len(i) > 0]
            return person

        data_list = {}
        data_list['type'] = "咨询"
        data_list['title'] = html.xpath("//div[@class='col-xs-10 col-sm-10 col-md-10 o-font4 my-2']//text()")
        data_list['person'] = html.xpath("//div[@class='col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted ']/text()")[0].lstrip('来信人:')
        data_list['person'] = process_person(data_list['person'])
        data_list['date'] = html.xpath("//div[@class='col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted ']/text()")[0].lstrip('时间:')
        data_list['num'] = html.xpath("//div[@class='col-xs-4 col-lg-3 col-sm-3 col-md-3 text-muted ']/label/text()")
        data_list['num'] = process_num(data_list['num'])[0]
        data_list['content'] = html.xpath("//div[@class='col-xs-12 col-md-12 column p-2 text-muted mx-2']//text()")
        data_list['content'] = process_content(data_list['content'])
        data_list['jigou'] = html.xpath("//div[@class='col-xs-9 col-sm-7 col-md-5 o-font4 my-2']/text()")
        data_list['jigou'] = process_jigou(data_list['jigou'])[0]
        data_list['date2'] = html.xpath("//div[@class='col-xs-12 col-sm-3 col-md-3 my-2 ']/text()")[0].lstrip('答复时间:')
        data_list['hfcontent'] = html.xpath("//div[@class='col-xs-12 col-md-12 column p-4 text-muted my-3']//text()")
        data_list['hfcontent'] = process_hfcontent(data_list['hfcontent'])
        print(data_list)

        # Write one '||'-separated record per letter.
        write(data_list['type'] + "||")
        for i in data_list['title']:
            write(i)
        write("||")
        for i in data_list['person']:
            write(i)
        write("||" + data_list['date'] + "||")
        write(data_list['num'] + "||")
        for i in data_list['content']:
            write(i)
        write("||" + data_list['jigou'] + "||")
        write(data_list['date2'] + "||")
        for i in data_list['hfcontent']:
            write(i)
        write("\n")
    except:
        print("Failed to scrape this consultation!")


if __name__ == "__main__":
    read()
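
Since every consultation ends up as a single line in zx3.txt with '||' separating the nine fields written above (type, title, person, date, num, content, jigou, date2, hfcontent), the records can be split apart again later. A minimal sketch, assuming the file is read back with the same default encoding it was written with:

fields = ['type', 'title', 'person', 'date', 'num', 'content', 'jigou', 'date2', 'hfcontent']
with open('D://zx3.txt', 'r') as f:
    for line in f:
        # zip truncates if a record was cut short by a failed request.
        record = dict(zip(fields, line.rstrip('\n').split('||')))
        print(record.get('num'), record.get('title'))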
The data obtained: