Assignment 1:
Code (single-threaded version):
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import urllib.parse
import time

start_time = time.perf_counter()   # high-resolution timer for measuring total run time
start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    "User-Agent": "Mozilla/5.0(Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2"
}

def imageSpider(start_url):
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])   # guess the page encoding
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:          # skip addresses already downloaded
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url):
    global count
    try:
        count = count + 1
        if url[len(url) - 4] == '.':         # keep the extension if the URL ends with one
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("download " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
imageSpider(start_url)
end_time = time.perf_counter()
print("Elapsed time:", end_time - start_time)
Results:
Code (multi-threaded version):
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import urllib.parse
import threading
import time

start_time = time.perf_counter()
start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    "User-Agent": "Mozilla/5.0(Windows NT 6.0 x64;en-US;rv:1.9pre)Gecko/2"
}

def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, 'lxml')
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.parse.urljoin(start_url, src)
                if url not in urls:
                    # note: url is never appended to urls here, which is why some images
                    # come back duplicated (see the sketch after the reflections below)
                    print(url)
                    count = count + 1
                    # download each image in its own (non-daemon) thread
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)

def download(url, count):
    try:
        if url[len(url) - 4] == '.':
            ext = url[len(url) - 4:]
        else:
            ext = ''
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("download " + str(count) + ext)
    except Exception as err:
        print(err)

count = 0
threads = []
imageSpider(start_url)
for t in threads:
    t.join()          # wait for all download threads to finish
end_time = time.perf_counter()
print("The End")
print("Elapsed time:", end_time - start_time)
Results:
Reflections:
Multi-threaded crawling is much faster than the single-threaded version, but some of the downloaded images are duplicated, so the code still needs some adjustment.
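One possible adjustment is sketched below. In the multi-threaded imageSpider the check "if url not in urls" never filters anything, because the URL is never appended to urls, so the same image address can be scheduled for download more than once. Recording every scheduled URL in a set removes the duplicates. The sketch assumes the same headers, download() and global variables as in the listing above.

# Sketch of a deduplicating imageSpider; headers, download(), count and
# threads are assumed to be defined exactly as in the listing above.
import threading
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup, UnicodeDammit

def imageSpider(start_url):
    global threads
    global count
    seen = set()                          # URLs already scheduled for download
    try:
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req).read()
        data = UnicodeDammit(data, ["utf-8", "gbk"]).unicode_markup
        soup = BeautifulSoup(data, "lxml")
        for image in soup.select("img"):
            try:
                url = urllib.parse.urljoin(start_url, image["src"])
                if url in seen:           # the check filters now, because every
                    continue              # scheduled URL is recorded below
                seen.add(url)
                print(url)
                count = count + 1
                T = threading.Thread(target=download, args=(url, count))
                T.daemon = False
                T.start()
                threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)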
Assignment 2:
WeatherSpider.py:
import scrapy
from scrapy.selector import Selector
from Spider.items import ImgItem

class WeatherSpider(scrapy.Spider):
    name = 'WeatherSpider'

    def start_requests(self):
        url = 'http://www.weather.com.cn/weather/101280601.shtml'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.url)
        data = response.body.decode()
        # print(data)
        selector = Selector(text=data)                 # build a Selector and query it with XPath
        print(selector)
        s = selector.xpath("//img/@src").extract()     # extract the src attribute of every img tag
        print(s)                                       # s is the list of image addresses
        # print(s[0])
        for src in s:
            item = ImgItem()
            item["src"] = src
            yield item
items.py:
import scrapy

class ImgItem(scrapy.Item):
    src = scrapy.Field()
pipelines.py:
import urllib.request
from itemadapter import ItemAdapter

class SpiderPipeline(object):
    count = 0          # class-level counter used to number the image files

    def process_item(self, item, spider):
        SpiderPipeline.count += 1
        try:
            url = item["src"]
            print(url)
            if url[len(url) - 4] == '.':
                ext = url[len(url) - 4:]
            else:
                ext = ''
            req = urllib.request.Request(url)
            data = urllib.request.urlopen(req)
            data = data.read()
            fobj = open("F:\\数据采集\\10_14\\images\\" + str(SpiderPipeline.count) + ext, "wb")
            fobj.write(data)
            fobj.close()
            print("download " + str(SpiderPipeline.count) + ext)
        except Exception as err:
            print(err)
        return item
Uncomment the following block in settings.py:
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
}
run.py:
from scrapy import cmdline
cmdline.execute("scrapy crawl WeatherSpider -s LOG_ENABLED=False".split())
Results:
Reflections:
Learned to use "/@attrName" in an XPath expression to select the attrName attribute node of a Selector element, and extract() to get the attribute values.
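As a small, self-contained illustration of that pattern (the HTML string here is made up, not the actual weather page):

# Minimal XPath sketch: "/@attrName" selects an attribute node,
# extract() turns the matched nodes into a list of strings.
from scrapy.selector import Selector

html = '<div><img src="a.png"/><img src="b.png" alt="pic"/></div>'   # hypothetical markup
sel = Selector(text=html)

print(sel.xpath("//img/@src").extract())   # ['a.png', 'b.png']
print(sel.xpath("//img/@alt").extract())   # ['pic']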
Assignment 3:
Code:
shareSpider.py:
import re
from Spider.items import lineItem
import scrapy
from Spider.pipelines import SpiderPipeline
from scrapy.selector import Selector

class shareSpider(scrapy.Spider):
    name = "shareSpider"

    def start_requests(self):
        url = 'http://77.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124041523442512990894_1603196582234&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=1603196582235'
        # In the URL, pn is the page number and pz is how many stock records to return.
        # Field codes: f2 latest price, f3 change rate, f4 change amount, f5 volume,
        # f6 turnover, f7 amplitude, f12 stock code, f14 stock name, f15 high,
        # f16 low, f17 open today, f18 previous close.
        # (A json-based parsing sketch follows this listing.)
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        data = response.body.decode()
        data = re.findall(r'"diff":\[(.*?)\]', data)          # grab the list inside "diff":[...]
        datas = data[0].strip("{").strip("}").split("},{")
        # print(datas)
        for data_line in datas:                               # process the records one by one
            line_item = data_line.split(',')
            item = lineItem()
            item["id"] = line_item[6].split(":")[1]
            item["name"] = line_item[7].split(":")[1]
            item["new_price"] = line_item[0].split(":")[1]
            item["up_rate"] = line_item[1].split(":")[1]
            item["down_rate"] = line_item[2].split(":")[1]
            item["pass_number"] = line_item[3].split(":")[1]
            item["pass_money"] = line_item[4].split(":")[1]
            item["rate"] = line_item[5].split(":")[1]
            item["highest"] = line_item[8].split(":")[1]
            item["lowest"] = line_item[9].split(":")[1]
            item["today"] = line_item[10].split(":")[1]
            item["yesterday"] = line_item[11].split(":")[1]
            yield item
        print(SpiderPipeline.tb)                              # print the table after all rows are added
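Since the response body is JSON wrapped in a jQuery callback, an alternative to the regex-and-split parsing above is to strip the callback wrapper and use the json module. The sketch below is only a possible variant of parse for shareSpider: it assumes the record list sits under a "diff" key inside a "data" object (the "diff" key matches the regex above; the "data" level is an assumption), and it reuses the field-code meanings from the comment in start_requests.

# Hypothetical json-based parse variant, as a sketch only; assumes the same
# imports as shareSpider.py above (in particular lineItem).
import json

def parse_json(self, response):
    text = response.body.decode()
    # strip the "jQuery...(" prefix and the trailing ")" of the callback wrapper
    payload = text[text.index("(") + 1: text.rindex(")")]
    body = json.loads(payload)
    # assumption: the record list is nested as data -> diff in the payload
    diff = body.get("data", {}).get("diff", [])
    for rec in diff:
        item = lineItem()
        item["id"] = rec["f12"]            # stock code
        item["name"] = rec["f14"]          # stock name
        item["new_price"] = rec["f2"]      # latest price
        item["up_rate"] = rec["f3"]        # change rate
        item["down_rate"] = rec["f4"]      # change amount
        item["pass_number"] = rec["f5"]    # volume
        item["pass_money"] = rec["f6"]     # turnover
        item["rate"] = rec["f7"]           # amplitude
        item["highest"] = rec["f15"]       # high
        item["lowest"] = rec["f16"]        # low
        item["today"] = rec["f17"]         # open today
        item["yesterday"] = rec["f18"]     # previous close
        yield item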
pipelines.py:
import prettytable as pt

class SpiderPipeline(object):
    count = 0
    tb = pt.PrettyTable(["No.", "Stock code", "Stock name", "Latest price", "Change rate", "Change amount",
                         "Volume", "Turnover", "Amplitude", "High", "Low", "Open today", "Previous close"])

    def process_item(self, item, spider):
        SpiderPipeline.count += 1
        SpiderPipeline.tb.add_row(
            [SpiderPipeline.count, item["id"], item["name"], item["new_price"], item["up_rate"], item["down_rate"],
             item["pass_number"], item["pass_money"], item["rate"], item["highest"], item["lowest"], item["today"],
             item["yesterday"]])
        return item
items.py:
import scrapy

class lineItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    name = scrapy.Field()
    new_price = scrapy.Field()
    up_rate = scrapy.Field()
    down_rate = scrapy.Field()
    pass_number = scrapy.Field()
    pass_money = scrapy.Field()
    rate = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()
In settings.py, set:
ROBOTSTXT_OBEY = False
and uncomment:
ITEM_PIPELINES = {
    'Spider.pipelines.SpiderPipeline': 300,
}
run.py:
from scrapy import cmdline
cmdline.execute("scrapy crawl shareSpider -s LOG_ENABLED=False".split())
Results:
Reflections:
Once it is clear that the Spider handles data fetching and the pipeline handles data processing, and the workflow from the previous assignment is reused, the implementation is not too difficult. At first, parse in shareSpider was never called; setting ROBOTSTXT_OBEY = False fixed it.
ROBOTSTXT refers to the robots protocol (also called the robots exclusion protocol), which limits what content a crawler program may fetch. The rules are usually written in a robots.txt file stored on the website's server.
A crawler is supposed to check this file first when visiting a site. Setting ROBOTSTXT_OBEY = False tells Scrapy to ignore the protocol, which is what you need when the content you want to crawl is disallowed by it but you still want to fetch it.
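For reference, the rules Scrapy consults when ROBOTSTXT_OBEY is True can also be checked by hand with Python's standard urllib.robotparser. The sketch below is illustrative only; whether the stock API path is actually disallowed depends on the site's live robots.txt.

# Minimal robots.txt check using the standard library (illustrative only).
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://77.push2.eastmoney.com/robots.txt")   # the site's robots.txt location
rp.read()                                                # fetch and parse the file

# can_fetch(user_agent, url) answers: may this agent crawl this URL?
url = "http://77.push2.eastmoney.com/api/qt/clist/get"
print(rp.can_fetch("*", url))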