使用环境
- Python3.6
- Django1.11.11
- Xadmin 后台管理插件
- MySQL数据库
文件目录结构
文件目录结构
数据表结构 models.py
from django.db import models
from django.contrib.auth.models import AbstractUser
import time
"""
学习到的知识:
1) 一个表需要多次关联同一个表时,需要为每个外键指定不同的反向关联名(related_name),必要时再指定反向查询名(related_query_name)
2) 索引的使用:db_index=True
"""
# Create your models here.
# 用户表
class UserInfo(AbstractUser):
    """Site user: Django's AbstractUser extended with passenger details."""

    gender = models.CharField('性别', max_length=32)  # gender
    phone = models.CharField('手机号', max_length=32)  # mobile phone number
    name = models.CharField('姓名', max_length=32)  # real name
    ID_number = models.CharField('身份证号', max_length=32)  # national ID card number

    class Meta:
        verbose_name_plural = '用户表'

    def __str__(self):
        # Rendered as "<pk>----<username>".
        return '{}----{}'.format(self.pk, self.username)
# 车站表
class Station(models.Model):
    """A railway station; every station belongs to exactly one City."""

    # The primary key is a plain integer supplied by the caller, not an
    # auto-increment column.
    id = models.IntegerField(primary_key=True)
    station_name = models.CharField('车站名称', max_length=32)  # station name
    english = models.CharField('英文编码', max_length=32, db_index=True)  # English code (indexed)
    spell = models.CharField('拼音', max_length=32)  # full pinyin spelling
    spell_brief = models.CharField('拼音简', max_length=32)  # abbreviated pinyin
    # on_delete is spelled out: CASCADE is what Django 1.11 applies implicitly,
    # and the argument becomes mandatory in Django 2.0.
    city = models.ForeignKey(verbose_name='关联城市', to='City', db_index=True,
                             on_delete=models.CASCADE)  # owning city

    def __str__(self):
        return str(self.station_name)

    class Meta:
        verbose_name_plural = '车站表'
# 城市
class City(models.Model):
    """A city that stations are attached to."""

    city_name = models.CharField('城市列表', max_length=32)  # city name

    class Meta:
        verbose_name_plural = '城市'

    def __str__(self):
        return '{}'.format(self.city_name)
# 列车表
class Train(models.Model):
    """A train service with its terminal stations and the stations it calls at."""

    train_size = models.CharField('列车号', max_length=32, db_index=True)  # train number (indexed)
    train_coding = models.CharField('列车编码', max_length=32)  # internal train code
    # Two foreign keys point at Station, so each one needs its own related_name
    # to keep the reverse accessors on Station from clashing.
    # on_delete is explicit: CASCADE is the implicit Django 1.11 behaviour and
    # the argument is mandatory from Django 2.0.
    start_stand = models.ForeignKey(verbose_name='起始站', to='Station',
                                    related_name='related_start_stand',
                                    on_delete=models.CASCADE)  # origin station
    terminus = models.ForeignKey(verbose_name='终点站', to='Station',
                                 related_name='related_terminus',
                                 on_delete=models.CASCADE)  # terminal station
    depart = models.CharField('始发时间', max_length=32)  # departure time at the origin
    arrive = models.CharField('到达时间', max_length=32)  # arrival time at the terminus
    # NOTE(review): integer default on a CharField; kept as-is to avoid a new migration.
    coach_num = models.CharField('车厢数', max_length=32, default=7)  # number of coaches
    # Stations this train calls at; the per-stop details live on Station2Train.
    station = models.ManyToManyField(verbose_name='关联列车进站时间表', to='Station',
                                     through='Station2Train',
                                     through_fields=('train', 'station'))

    def __str__(self):
        return str(self.train_size)

    class Meta:
        verbose_name_plural = '列车表'
# 列车进站时间表 车站——列车多对多表
class Station2Train(models.Model):
    """One stop of a train at a station (the Train <-> Station through table)."""

    # on_delete is explicit: CASCADE is the implicit Django 1.11 behaviour and
    # the argument is mandatory from Django 2.0.
    station = models.ForeignKey(verbose_name='关联车站表', to='Station',
                                on_delete=models.CASCADE)  # station of this stop
    train = models.ForeignKey(verbose_name='关联列车表', to='Train',
                              on_delete=models.CASCADE)  # train making this stop
    station_next = models.CharField('站次(这趟车第几次经过)', max_length=32)  # stop sequence number
    arrive_time = models.CharField('到达时间', max_length=32)  # arrival time
    depart_time = models.CharField('出发时间', max_length=32)  # departure time
    distance = models.CharField('和上一站的距离', max_length=32)  # labelled "distance from the previous station"
    is_state = models.CharField('是起终停', max_length=32)  # origin / terminus / intermediate marker

    def __str__(self):
        # NOTE(review): the leading 'station' is a fixed literal, not the station
        # name -- possibly meant to be str(self.station); confirm before changing.
        return 'station' + '的到达时间:' + str(self.arrive_time) + ' 出发时间' + str(self.depart_time)

    class Meta:
        # A train may be linked to a given station only once.
        unique_together = ("station", "train")
        verbose_name_plural = '列车进站时间表'
# 车座表
class Seat(models.Model):
    """A single seat in one coach of a train."""

    # Seat classes offered; the integer is what gets stored in seat_type.
    choices = (
        (1, '商务座'),
        (2, '一等座'),
        (3, '二等座'),
        (4, '高级软卧'),
        (5, '高级硬卧'),
        (6, '硬座'),
        (7, '无座'),
    )
    # on_delete is explicit: CASCADE is the implicit Django 1.11 behaviour and
    # the argument is mandatory from Django 2.0.
    train = models.ForeignKey(verbose_name='关联列车表', to='Train', null=True, db_index=True,
                              on_delete=models.CASCADE)  # train the seat belongs to
    coach_size = models.CharField('车厢号', max_length=32)  # coach number
    seat_type = models.IntegerField('座位类型', choices=choices)  # seat class (see choices)
    seat_size = models.CharField('座位号', max_length=32)  # seat number
    is_sell = models.CharField('出售情况', max_length=64, null=True, db_index=True)  # sale status string

    def __str__(self):
        # Shows the stored integer code, not the human-readable label.
        return str(self.seat_type)

    class Meta:
        verbose_name_plural = '车座表'
# 邮箱验证码
class EmailVerifyRecord(models.Model):
    """An e-mail verification code issued for registration or password recovery."""

    code = models.CharField(verbose_name="验证码", max_length=20)
    email = models.EmailField(verbose_name="邮箱", max_length=50)
    send_type = models.CharField(
        max_length=10,
        choices=(
            ('register', "注册"),
            ('forget', "找回密码"),
        ),
    )
    # auto_now: the timestamp is refreshed every time the row is saved.
    send_time = models.DateTimeField(auto_now=True)

    class Meta:
        verbose_name = "邮箱验证码"
        verbose_name_plural = verbose_name
# 车票表
class Ticket(models.Model):
    """A purchased ticket, keyed by its ticket number."""

    ticket = models.CharField('车票号', max_length=32, primary_key=True)  # ticket number (primary key)
    # on_delete is explicit: CASCADE is the implicit Django 1.11 behaviour and
    # the argument is mandatory from Django 2.0.
    train_size = models.ForeignKey(verbose_name='关联列车号', to='Train',
                                   on_delete=models.CASCADE)  # train the ticket is for
    coach_size = models.CharField('车厢号', max_length=32)  # coach number
    seat_size = models.CharField('座位号', max_length=32)  # seat number
    user = models.ForeignKey(verbose_name='关联用户表', to='UserInfo',
                             on_delete=models.CASCADE)  # purchasing user
    price = models.CharField('价格', max_length=32)  # price
    pay_type = models.CharField('支付方式', max_length=32)  # payment method
    depart_time = models.CharField('出发时间', max_length=32)  # departure time
    arrive_time = models.CharField('到达时间', max_length=32)  # arrival time
    depart_stand = models.CharField('出发站', max_length=32)  # departure station
    arrive_stand = models.CharField('到达站', max_length=32)  # arrival station
    buy_time = models.CharField('购买时间', max_length=32)  # purchase time
    is_quit = models.CharField('是否退票', max_length=32)  # refund flag

    def __str__(self):
        return str(self.ticket)

    class Meta:
        verbose_name_plural = '车票表'
# 爬虫IP表
class IP(models.Model):
    """A proxy address used by the crawler."""

    ip = models.CharField('IP地址', max_length=32)  # proxy host
    port = models.CharField('端口号', max_length=32)  # proxy port
    expire_time = models.CharField('过期时间', max_length=32)  # expiry time of the proxy
    city = models.CharField('地区', max_length=32)  # region of the proxy

    class Meta:
        verbose_name_plural = '爬虫IP表'

    def __str__(self):
        # Rendered as "<ip>:<port>".
        return '{}:{}'.format(self.ip, self.port)
爬取所需代理ip ip.py
(截至 2019-7-4,该爬取方式可能已经失效,建议先自行学习一下 requests 库)
import requests
from app01 import models
class Get_IP():
    """Obtain a proxy from the Zhima proxy API and keep it in the IP table.

    Exactly one proxy is tracked: the IP row whose primary key is 1.
    """

    def __init__(self):
        pass

    def select_ip(self):
        """Return the stored proxy row (id=1), or None when nothing is stored."""
        return models.IP.objects.filter(id=1).first()

    def zhimaruanjian(self, url=None):
        """Fetch a fresh proxy and store it as IP row id=1.

        Uses the Zhima proxy service (http://webapi.http.zhimacangku.com).

        :param url: full API URL; when omitted, a default URL is used whose
            ``pack`` value is a placeholder that must be replaced.
        :return: None when a proxy was stored, True when the API answered with
            a non-zero ``code``.
        :raises requests.exceptions.RequestException: on network failure or
            when no answer arrives within 10 seconds.
        """
        if not url:
            # The last parameter is "&regions="; the original text contained a
            # mis-decoded HTML entity ("®ions=").
            url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&pack=自己的pack号&ts=1&ys=0&cs=1&lb=1&sb=0&pb=4&mr=1&regions='
        # A timeout keeps a dead API endpoint from blocking the crawler forever.
        r = requests.get(url, timeout=10)
        res = r.json()
        if res['code'] != 0:
            return True

        entry = res['data'][0]
        ip = entry['ip']
        port = entry['port']
        print('获取到新ip %s'%(str(ip) +':'+ str(port)))
        # Always write to id=1, the row select_ip() reads. A plain save() would
        # take the next auto-increment id, which is not 1 once rows have been
        # deleted, and the new proxy would then never be picked up.
        models.IP.objects.update_or_create(
            id=1,
            defaults={
                'ip': ip,
                'port': port,
                'expire_time': entry['expire_time'],
                'city': entry['city'],
            },
        )
if __name__ == '__main__':
    # Manual run: fetch one proxy and store it.
    Get_IP().zhimaruanjian()
数据的定制爬取
(截至2019-7-4,数据统计:爬取车站总数2863个,涉及城市1260个,车站停靠数86037个,自制座位数5244727条( 车站停靠数86037个 X 车厢数7节 X 每车厢100座位))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from django.core.wsgi import get_wsgi_application
# Stand-alone script bootstrap: the settings module is named and the Django
# application is initialised BEFORE `from app01 import models` below, because
# the models cannot be imported until Django has been configured.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cp12306.settings")
application = get_wsgi_application()
from app01 import models
import xlrd
import json
import time
import requests
import random
from lxml import etree
from django.db.models import Count
from app01.myfile.ip import Get_IP
from concurrent.futures import ThreadPoolExecutor # thread pool used by the distance crawler
# NOTE: rebinds the name Get_IP from the imported class to a single shared
# instance; every later use of Get_IP in this script refers to that instance.
Get_IP = Get_IP()
"""
用到的知识点:
1) bulk_create 批量添加数据
2) xlrd 操作Excel表
3) list(set(city_name_list)) 列表 利用set的自动去重功能
4) 分组和聚合函数使用:
train_list = models.Train.objects.annotate(a = Count('station2train__train')).values('train_size','a')
"""
# 添加城市
def city():
    """Load the distinct city names from the station spreadsheet into City.

    Reads column index 5 of the first sheet of '火车站信息表.xlsx', removes
    duplicates and bulk-inserts one City per name with ids 1..N.
    """
    from django.db import IntegrityError

    data = xlrd.open_workbook('火车站信息表.xlsx')  # open the workbook
    city_name_list = data.sheets()[0].col_values(5)  # city-name column
    # dict.fromkeys de-duplicates while keeping first-seen order, so every run
    # assigns the same id to the same city (set() iteration order is not stable
    # between runs for strings).
    unique_names = list(dict.fromkeys(city_name_list))
    query_list = [
        models.City(id=x, city_name=i)
        for x, i in enumerate(unique_names, 1)
    ]
    try:
        models.City.objects.bulk_create(query_list)
    except IntegrityError:
        # Primary keys already present: the table was populated by an earlier run.
        print('城市列表已经存在!')
    else:
        # Report success only after the insert really succeeded.
        print('城市列表添加完成!')
# 添加车站
def station():
    """Load every row of the station spreadsheet into Station.

    Row i (0-based) of the first sheet of '火车站信息表.xlsx' becomes the Station
    with id i + 1; columns 1-4 are name, English code, pinyin and short pinyin,
    and column 5 is the city name, resolved to a City id.
    """
    from django.db import IntegrityError

    # city name -> City id
    city_dic = dict(models.City.objects.values_list('city_name', 'id'))

    sheet = xlrd.open_workbook('火车站信息表.xlsx').sheets()[0]
    query_list = []
    for i in range(sheet.nrows):
        row = sheet.row_values(i)  # one spreadsheet row
        query_list.append(models.Station(
            id=i + 1,
            station_name=row[1],
            english=row[2],
            spell=row[3],
            spell_brief=row[4],
            # None when the city name is unknown; the insert is then rejected.
            city_id=city_dic.get(row[5]),
        ))
    try:
        models.Station.objects.bulk_create(query_list)
    except IntegrityError:
        print('车站列表已经存在!')
    else:
        print('车站列表添加完成!')
#
# # 添加座位类型
# def seat_type():
# seat_list = ['商务座', '一等座', '二等座', '高级软卧', '高级硬卧', '硬座', '无座']
# query_list = []
# for id, seat_type in enumerate(seat_list, 1):
# query_list.append(models.Seat_Type(id=id, seat_type=seat_type))
#
# # 批量插入数据库之bulk_create()
# try:
# models.Seat_Type.objects.bulk_create(query_list)
# print('座位类型添加完成!')
# except:
# print('座位类型已经存在!')
# 爬取列车数据
def pa(station_dic, train_size):
    """Crawl the timetable page of one train and return its parsed contents.

    :param station_dic: mapping of station name -> Station id.
    :param train_size: train number; appended to the timetable URL.
    :return: False on any failure (network error, page without the expected
        layout, station name missing from ``station_dic``); otherwise a dict::

            {'start_stand': <station id>, 'terminus': <station id>,
             'depart': <str>, 'arrive': <str>,
             'data': {<stop number>: [station id, arrive_time, depart_time,
                                      distance, is_state]}}
    """
    url = 'http://checi.114piaowu.com/{}'.format(train_size)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'CLIENT_SOURCE=baidu_www.baidu.com; CLIENT_FIRST_ENTER=pc_shike; tostation=%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82; UM_distinctid=16bb7da72431fe-07a5e239d1d8e6-37677e02-1aeaa0-16bb7da7244977; JSESSIONID=DF0894D3C3B6127C656BF6ADF714674E; fromstation=%E9%98%BF%E5%B0%94%E5%B1%B1; CLIENT_LAST_ENTER=pc_checi',
        'Host': 'checi.114piaowu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def get_ip():
        """Build the requests proxy mapping from the currently stored proxy."""
        ip_obj = Get_IP.select_ip()
        ip = ip_obj.ip + ":" + ip_obj.port
        proxies = {
            'http': ip,
            'https': ip
        }
        print(proxies)
        return proxies

    def get_url():
        """Request the timetable page through the current proxy."""
        r = requests.get(url=url, headers=headers, proxies=get_ip(), timeout=10)
        print(r.status_code)
        return r

    def retry_with_new_ip():
        """Replace the stored proxy, then request the page again."""
        Get_IP.zhimaruanjian()
        return get_url()

    # ---- download ---------------------------------------------------------
    try:
        try:
            r = get_url()
        # A tuple is required here: `except A or B` evaluates to `except A`
        # and would never match ProxyError.
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ProxyError):
            print('代理ip无效')
            Get_IP.zhimaruanjian()
            return False
        except requests.exceptions.ReadTimeout:
            print('读取超时')
            return False
        except requests.exceptions.TooManyRedirects:
            # Treated as "this proxy has been blocked": retry with a new proxy,
            # and once more if that new proxy is itself unusable.
            print('ip被限制')
            try:
                r = retry_with_new_ip()
            except requests.exceptions.ProxyError:
                try:
                    r = retry_with_new_ip()
                except Exception:
                    print('未知错误!')
                    return False
    except Exception:
        # Anything else (including a missing stored proxy): rotate and give up.
        # `Exception` rather than a bare except so Ctrl-C still stops the run.
        print('未知错误!')
        Get_IP.zhimaruanjian()
        return False

    # ---- parse the page header ---------------------------------------------
    selector = etree.HTML(r.text)
    func_dic = {}
    try:
        names = selector.xpath("//dd[@class='line']/ul/li/a/text()")
        func_dic['start_stand'] = station_dic[names[0]]  # origin station id
        func_dic['terminus'] = station_dic[names[1]]  # terminal station id
        times = selector.xpath("//dd[@class='line']/ul/li/text()")
        func_dic['depart'] = times[0]  # departure time
        func_dic['arrive'] = times[1].split('(')[0]  # arrival time, text before '('
        rows = selector.xpath("//div[@class='list']/table//tr")
        func_dic['data'] = {}
    except (KeyError, IndexError, AttributeError):
        # Unknown station name, or the page does not have the expected layout.
        return False

    # ---- parse the stop table ----------------------------------------------
    for each in rows:
        numbers = each.xpath("./td[5]/text()")
        if not numbers:
            # Rows without a fifth text cell carry no stop data.
            continue
        numbers = numbers[0]
        try:
            station_next = each.xpath("./td[1]/text()")[0]  # stop number
            station = station_dic[each.xpath("./td[2]/a/text()")[0]]  # station id
            arrive_time = each.xpath("./td[3]/text()")[0]  # arrival time
            depart_time = each.xpath("./td[4]/text()")[0]  # departure time
            # '--' in the distance column is stored as 0.
            distance = 0 if numbers == '--' else int(numbers)
            if station == func_dic['terminus']:
                is_state = '终'  # terminus
            elif arrive_time == '--':
                is_state = '起'  # origin: no arrival time
            else:
                is_state = '暂'  # intermediate stop
        except (KeyError, IndexError, ValueError):
            # Unknown station, missing cell or non-numeric distance: reject the
            # whole train instead of crashing the crawl.
            return False
        func_dic['data'][station_next] = [station, arrive_time, depart_time, distance, is_state]
    return func_dic
# 保存车列表
def train_save(dic):
    """Persist one crawled train together with all of its stops.

    :param dic: dict with keys 'train_size', 'train_coding', 'start_stand' and
        'terminus' (Station ids), 'depart', 'arrive', and 'data', a mapping of
        stop number -> [station id, arrive_time, depart_time, distance, is_state].

    The train row and its stop rows are written in one transaction, so a
    failure part-way through leaves no half-saved train behind.
    """
    from django.db import transaction

    train_size = dic.get('train_size')  # train number
    with transaction.atomic():
        # The *_id attributes take the Station ids directly, which avoids one
        # lookup query per station.
        train_obj = models.Train.objects.create(
            train_size=train_size,
            train_coding=dic.get('train_coding'),
            start_stand_id=dic.get('start_stand'),
            terminus_id=dic.get('terminus'),
            depart=dic.get('depart'),
            arrive=dic.get('arrive'),
        )
        stops = [
            models.Station2Train(
                station_next=station_next,  # stop number
                station_id=val[0],          # station of this stop
                train=train_obj,            # the train just created
                arrive_time=val[1],
                depart_time=val[2],
                distance=val[3],
                is_state=val[4],
            )
            for station_next, val in dic.get('data').items()
        ]
        models.Station2Train.objects.bulk_create(stops)
    print('{}次列车信息存入成功!'.format(train_size))
# 获取车表
def train(date='2019-07-16', path='train_list.js'):
    """Crawl and store every train listed for one date in the train-list file.

    :param date: key of the day to import from the JSON file.
    :param path: path of the JSON train-list file.
    :raises ValueError: when the file has no entry for ``date``.

    Trains already in the database are skipped. At the end the train numbers
    that were skipped and those that could not be crawled are printed.
    """
    info = []    # train numbers already stored
    errors = []  # train numbers whose crawl failed

    # station name -> Station id
    station_dic = dict(models.Station.objects.values_list('station_name', 'id'))

    with open(path, 'rb') as f:
        data = json.loads(f.read()).get(date)
    if not data:
        raise ValueError('no train list for date {!r} in {}'.format(date, path))

    # Flatten the per-group lists into {train number: train code}; the train
    # number is the part of station_train_code before '('.
    data_dic = {
        entry.get('station_train_code').split('(')[0]: entry.get('train_no')
        for group in data.values()
        for entry in group
    }

    for i, (train_code, train_no) in enumerate(data_dic.items(), 1):
        print('正在爬第{}趟{}列车..'.format(i, train_code))
        if models.Train.objects.filter(train_size=train_code).exists():
            print('正在爬第{}趟{}列车数据已存在!'.format(i, train_code))
            info.append(train_code)
            continue

        res_dic = pa(station_dic, train_code)
        if res_dic:
            res_dic['train_size'] = train_code
            res_dic['train_coding'] = train_no
            train_save(res_dic)
        else:
            # Nothing was stored for this train, so there is nothing to delete.
            print('正在爬第{}趟{}列车数据报错!'.format(i, train_code))
            errors.append(train_code)

    print('已存在列车数据:', info)
    print('不存在列车数据:', errors)
# 添加座位表
def seat():
    """Generate the seat rows for every train: 7 coaches x 100 seats each.

    Coach n gets seat type n (see Seat.choices). ``is_sell`` starts as a string
    of '1' characters, one per stop of the train. Seat ids are assigned from a
    running counter starting at 1.
    """
    from django.db import IntegrityError

    # One query yields each train with its stop count. Ordering by pk makes the
    # assignment of seat ids to trains the same on every run.
    trains = (models.Train.objects
              .annotate(stop_count=Count('station2train__train'))
              .order_by('pk'))

    seat_id = 0  # running primary key (avoids shadowing the builtin `id`)
    for index, train in enumerate(trains, 1):
        train_size = train.train_size
        print(index, train_size, train.stop_count)
        sell = '1' * train.stop_count
        print(sell)

        query_list = []
        for coach_size in range(1, 8):  # 7 coaches
            for seat_size in range(1, 101):  # 100 seats per coach
                seat_id += 1
                query_list.append(models.Seat(
                    id=seat_id, train=train, coach_size=coach_size,
                    seat_type=coach_size, seat_size=seat_size, is_sell=sell))
        try:
            models.Seat.objects.bulk_create(query_list)
        except IntegrityError:
            # These ids were inserted by an earlier run.
            print('座位列表{}已经存在!'.format(seat_id))
        else:
            print('座位列表添加成功{}条!'.format(seat_id))
def get_ip():
    """Return all proxy addresses stored in the external ``ip`` table.

    :return: list of the ``ip`` column values of every row.

    Connection settings may be supplied through the environment variables
    IP_DB_HOST, IP_DB_USER, IP_DB_PASSWORD, IP_DB_NAME and IP_DB_PORT; the
    defaults are the values this script has always used.
    """
    import os
    import pymysql

    # NOTE(security): credentials for a root account are embedded in source as
    # defaults. They should be rotated and supplied only via the environment.
    host = os.environ.get('IP_DB_HOST', '106.75.31.89')
    user = os.environ.get('IP_DB_USER', 'root')
    password = os.environ.get('IP_DB_PASSWORD', 'Aa428912')
    database = os.environ.get('IP_DB_NAME', 'Ip_conn')
    port = int(os.environ.get('IP_DB_PORT', 3306))

    # Keyword arguments: positional connection arguments are not accepted by
    # newer PyMySQL releases.
    connect = pymysql.connect(host=host, user=user, password=password,
                              database=database, port=port, charset='utf8')
    try:
        cursor = connect.cursor(pymysql.cursors.DictCursor)  # rows come back as dicts
        try:
            cursor.execute('select ip from ip')
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connect.close()

    data_list = [row['ip'] for row in rows]
    print(data_list)
    print(len(data_list))
    return data_list
# 爬取距离
def pa_distance(station_name, station_name_1, ip):
    """Look up the distance between two stations on juli.liecheshike.com.

    :param station_name: name of the first station.
    :param station_name_1: name of the second station.
    :param ip: proxy address as "host:port"; the request is sent through it.
    :return: the text of the first <h3> element up to '公里', as a string.
    :raises IndexError: when the page contains no <h3> text.
    :raises requests.exceptions.RequestException: on network or proxy failure.
    """
    print(ip)
    proxies = {
        "http": "http://{}".format(ip),
        "https": "http://{}".format(ip),
    }
    url = 'http://juli.liecheshike.com/从{}到{}有多远'.format(station_name, station_name_1)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'safedog-flow-item=9C74D71A66F8B17A458732499BAEC7FF; ASPSESSIONIDCADCQTDB=JAMEOIKBLEPLECFDNHNCADNH; __51cke__=; ASPSESSIONIDCABBRQAC=KAGHCJNAIICFAABHNEOKPLIA; __tins__1516098=%7B%22sid%22%3A%201562545736853%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201562547696493%7D; __51laig__=14',
        'Host': 'juli.liecheshike.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # The proxies belong on the request; previously they were handed to
    # str.format() as an unused keyword, so the proxy was never used.
    r = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
    selector = etree.HTML(r.text)
    res = selector.xpath("//h3/text()")
    return res[0].split('公里')[0]
# 距离
def get_distance(data, start_id=45, max_workers=1):
    """Crawl station-to-station distances and store them on every stop.

    For each train, the stops are walked in insertion (pk) order; the distance
    between each consecutive pair of stations is crawled and the running total
    from the first stop is written to that stop's ``distance`` (the first stop
    gets '0').

    :param data: list of proxy addresses ("host:port"); one is chosen at random
        for every request.
    :param start_id: lowest Train id to process (45 is the value the script has
        always resumed from).
    :param max_workers: size of the thread pool.
    """

    def get(i):
        """Fill in the distances of every stop of the train with id ``i``."""
        stops = (models.Station2Train.objects
                 .filter(train_id=i)
                 .order_by('pk')  # explicit order instead of relying on the DB default
                 .values_list('station__station_name', 'pk'))
        previous_name = ""
        distances = 0  # running total from the first stop
        print('——————第{}站——————'.format(i))
        for index, (name, stop_pk) in enumerate(stops, 1):
            print('第', i, '的', index, '个')
            # Each row is updated through its own pk, so the update cannot hit
            # the wrong row when stop numbers are not exactly 1..n.
            if index == 1:
                models.Station2Train.objects.filter(pk=stop_pk).update(distance='0')
            else:
                distance = pa_distance(previous_name, name, ip=random.choice(data))
                distances += int(distance)
                print(distances)
                models.Station2Train.objects.filter(pk=stop_pk).update(distance=distances)
            previous_name = name

    res = time.time()
    sumber = models.Train.objects.all().count()
    # Real ids are used instead of range(start, count + 1), which silently
    # skips trains whenever ids are not contiguous.
    train_ids = list(models.Train.objects
                     .filter(pk__gte=start_id)
                     .order_by('pk')
                     .values_list('pk', flat=True))
    # The with-block waits for every task, so the elapsed time below covers the
    # actual crawl and the pool is shut down afterwards.
    with ThreadPoolExecutor(max_workers=max_workers) as TP:
        futures = [(i, TP.submit(get, i)) for i in train_ids]
        for i, future in futures:
            # Surface worker failures instead of losing them; one failed train
            # does not stop the others.
            error = future.exception()
            if error is not None:
                print('第{}趟列车距离爬取失败:{!r}'.format(i, error))
    print(sumber)
    print(time.time() - res)
if __name__ == '__main__':
    # Full import pipeline, in dependency order.
    Get_IP.zhimaruanjian()  # store a fresh proxy for the crawler
    city()                  # import cities
    station()               # import stations
    train()                 # crawl trains and their stops
    seat()                  # generate seats
    proxy_pool = get_ip()   # proxy addresses for the distance crawl
    get_distance(proxy_pool)
额外文件 (博客园无法上传大文件,给个外链接)
文件列表:
火车站信息表.xlsx
train_list.js
下载地址:
小强云盘分享链接:http://www.liqianglog.top:8002/home/share_link/K6X8028O08 提取密码为:353C 点击分享快去分享给好友啵~~
(如果失效,请联系博主,1206709430@qq.com)