Assignment 1
(1) Master the serialized output of Item and Pipeline data in Scrapy; crawl book data from the Dangdang website using the Scrapy + XPath + MySQL database storage route.
graph TD
A[Parse the Dangdang search URL] -->|print| B(Check that something was actually fetched)
B --> C{Extract the needed item fields with XPath}
C -->|connect to MySQL| D[Try writing a few records]
C -->|everything OK| F[run]
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/27
# Spider: crawls the Dangdang search results for the keyword "python".
import scrapy
from bs4 import UnicodeDammit
from ..items import DangdangItem


class spider_dangdang(scrapy.Spider):
    name = "spiderdangdang"
    # key = 'python'
    # source_url = 'http://www.dangdang.com/'
    # start_urls = ["http://search.dangdang.com/?key=python&act=input&page_index=2"]

    def start_requests(self):
        url = "http://search.dangdang.com/?key=python&act=input"
        print(url)
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # Guess the page encoding (utf-8 or gbk) before building the selector.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//ul[@class='bigimg']/li")
            for li in lis:
                title = li.xpath("./p[@class='name']/a/@title").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=3]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # print(title)
                item = DangdangItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["price"] = price.strip() if price else ""
                # Guard on date (not price); [1:] drops the leading "/" in front of the date.
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
        except Exception as err:
            print(err)
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()
import pymysql


class DangdangPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="031804114.hao", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")   # start from an empty table on every run
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Scraped", self.count, "books in total")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute(
                    "insert into books(bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                    "values (%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
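For the pipeline above to actually receive the items, it also has to be enabled in the project's settings.py. A minimal sketch, assuming the Scrapy project (and therefore the Python package) is named dangdang:

# settings.py -- register the pipeline so Scrapy routes every yielded item through it
# (the package name "dangdang" is an assumption; use the project's real name)
ITEM_PIPELINES = {
    "dangdang.pipelines.DangdangPipeline": 300,
}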
(2) Reflections
The XPath selectors need to be written fairly precisely. For removing whitespace in Python 3, str.strip() strips it from both ends of a string. Also, remember to add character set = utf8 when creating the MySQL table; the default character set is not utf8, and without it storing the data causes trouble.
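To go with the note about the table character set, here is a minimal sketch of creating the books table before the first run. The column names are the ones used in the pipeline's insert statement; the column types and lengths are assumptions.

# create_books_table.py -- hypothetical one-off helper; creates the table with an explicit utf8 charset
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="031804114.hao", db="mydb", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    create table if not exists books (
        bTitle     varchar(512),
        bAuthor    varchar(256),
        bPublisher varchar(256),
        bDate      varchar(32),
        bPrice     varchar(32),
        bDetail    text
    ) character set = utf8
""")
con.commit()
con.close()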
Assignment 2
(1) Master the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy + XPath + MySQL database storage route.
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/28
# Spider: crawls stock quotes from the Eastmoney list API (JSON), first five pages.
import scrapy
import json
import re
import math
from ..items import GupiaomysqlItem


class spider_gupiao(scrapy.Spider):
    name = "spidergupiao"
    start_urls = ["http://11.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1603874731034"]

    def parse(self, response):
        try:
            sites = json.loads(response.text)       # response.text is the decoded body
            data = sites["data"]
            diff = data["diff"]
            print(diff)                             # debug output
            print(len(diff))
            for i in range(len(diff)):
                item = GupiaomysqlItem()
                item["mount"] = str(i)              # row index within the current page
                item["code"] = str(diff[i]["f12"])
                item["name"] = str(diff[i]["f14"])
                item["lately"] = str(diff[i]["f2"])
                item["zhangdiefu"] = str(diff[i]["f3"])
                item["zhangdiee"] = str(diff[i]["f4"])
                item["chengjiaoliang"] = str(diff[i]["f5"])
                item["chengjiaoe"] = str(diff[i]["f6"])
                item["zhenfu"] = str(diff[i]["f7"])
                item["zuigao"] = str(diff[i]["f15"])
                item["zuidi"] = str(diff[i]["f16"])
                item["jinkai"] = str(diff[i]["f17"])
                item["zuoshou"] = str(diff[i]["f18"])
                yield item
            # all_page = math.ceil(eval(re.findall(r'"total":(\d+)', response.text)[0]) / 20)
            page = re.findall(r"pn=(\d+)", response.url)[0]     # current page number
            if int(page) < 5:                                   # crawl the first five pages
                # rewrite the pn parameter to move to the next page
                url = response.url.replace("pn=" + page, "pn=" + str(int(page) + 1))
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)
import scrapy


class GupiaomysqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    mount = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    lately = scrapy.Field()
    zhangdiefu = scrapy.Field()
    zhangdiee = scrapy.Field()
    chengjiaoliang = scrapy.Field()
    chengjiaoe = scrapy.Field()
    zhenfu = scrapy.Field()
    zuigao = scrapy.Field()
    zuidi = scrapy.Field()
    jinkai = scrapy.Field()
    zuoshou = scrapy.Field()
import pymysql


class GupiaomysqlPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="031804114.hao", db="gupiao", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from information")   # start from an empty table on every run
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Scraped", self.count, "records in total")

    def process_item(self, item, spider):
        try:
            print(item["mount"])    # debug output
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into information(id,bno,bname,bLatestPrice,bZhangDieFu,bZhangDieE,"
                    "bChengJiaoLiang,bChengJioaE,bZhenFu,bZuiGao,bZuiDi,bJinKai,bZuoShou) "
                    "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (item["mount"], item["code"], item["name"], item["lately"],
                     item["zhangdiefu"], item["zhangdiee"], item["chengjiaoliang"],
                     item["chengjiaoe"], item["zhenfu"], item["zuigao"],
                     item["zuidi"], item["jinkai"], item["zuoshou"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
(2) Reflections
This mostly follows the earlier stock-scraping exercise and just crawls five pages; the database handling is the same as in Assignment 1.
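As an aside on the five-page crawl: instead of rewriting the pn parameter inside parse(), the five page URLs can also be generated up front in start_requests(). A minimal sketch, assuming the same Eastmoney endpoint as in start_urls and that only pn changes between pages:

    # Inside spider_gupiao: request pages 1-5 directly; self.start_urls[0] is the pn=1 URL above.
    def start_requests(self):
        first_page = self.start_urls[0]
        for page in range(1, 6):
            yield scrapy.Request(url=first_page.replace("pn=1", "pn=" + str(page)),
                                 callback=self.parse)

With this variant, the page-jumping code at the end of parse() is no longer needed.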
Assignment 3
(1) Master the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL database storage route.
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/30
# Spider: crawls the real-time foreign-exchange quotes from China Merchants Bank.
import scrapy
from bs4 import UnicodeDammit
from ..items import WaihuimysqlItem


class spider_waihui(scrapy.Spider):
    name = "spiderwaihui"
    start_urls = ["http://fx.cmbchina.com/hq/"]

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            # print(data)
            selector = scrapy.Selector(text=data)
            trs = selector.xpath("//div[@id='realRateInfo']/table/tr")
            # print(trs)
            for tr in trs[1:]:          # skip the header row
                item = WaihuimysqlItem()
                a = tr.xpath("./td[position()=1][@class='fontbold']/text()").extract_first()
                item["type"] = str(a).strip()
                item["tsp"] = str(tr.xpath("./td[position()=4][@class='numberright']/text()").extract_first()).strip()
                item["csp"] = str(tr.xpath("./td[position()=5][@class='numberright']/text()").extract_first()).strip()
                item["tbp"] = str(tr.xpath("./td[position()=6][@class='numberright']/text()").extract_first()).strip()
                item["cbp"] = str(tr.xpath("./td[position()=7][@class='numberright']/text()").extract_first()).strip()
                item["time"] = str(tr.xpath("./td[position()=8][@align='center']/text()").extract_first()).strip()
                yield item
        except Exception as err:
            print(err)
import scrapy


class WaihuimysqlItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    type = scrapy.Field()
    tsp = scrapy.Field()
    csp = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    time = scrapy.Field()
import pymysql


class WaihuimysqlPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="031804114.hao", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from waihui")   # start from an empty table on every run
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
            print("closed")
            print("Scraped", self.count, "records in total")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute(
                    "insert into waihui(btype,btsp,bcsp,btbp,bcbp,btime) values (%s,%s,%s,%s,%s,%s)",
                    (item["type"], item["tsp"], item["csp"], item["tbp"], item["cbp"], item["time"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
(2) Reflections
After the previous two assignments this one was simple; the only thing to deal with was stripping the whitespace from the extracted text.
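Instead of calling str(...).strip() on every field, the whitespace can also be handled in the XPath itself with normalize-space(). A minimal sketch for the first column, as a possible alternative (normalize-space() also collapses internal runs of whitespace, which is harmless for these cells):

                # Alternative extraction of the currency name, letting XPath do the trimming
                item["type"] = tr.xpath("normalize-space(./td[position()=1][@class='fontbold'])").extract_first()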