zoukankan      html  css  js  c++  java
  • 4. 12306火车购票系统 (数据制作)

    使用环境

    文件目录结构

    文件目录结构

    数据表结构 models.py

    from django.db import models
    from django.contrib.auth.models import AbstractUser
    import time
    """
    学习到的知识:
    1) 一个表需要多次关联同一个表时,需要为每个外键分别指定反向关联名称(related_name)和反向查询名字(related_query_name)
    2) 索引的使用:db_index=True
    
    """
    
    
    # Create your models here.
    
    # 用户表
    class UserInfo(AbstractUser):
        """Site user: Django's ``AbstractUser`` extended with passenger details."""

        gender = models.CharField(verbose_name='性别', max_length=32)  # gender
        phone = models.CharField(verbose_name='手机号', max_length=32)  # mobile phone number
        name = models.CharField(verbose_name='姓名', max_length=32)  # real name
        ID_number = models.CharField(verbose_name='身份证号', max_length=32)  # national ID number

        class Meta:
            verbose_name_plural = '用户表'

        def __str__(self):
            # Rendered as "<pk>----<username>".
            return '{}----{}'.format(self.pk, self.username)
    
    
    
    
    # 车站表
    class Station(models.Model):
        """A railway station with its codes, pinyin spellings and owning city."""

        # Plain integer primary key (not an AutoField), so callers must supply the id.
        id = models.IntegerField(primary_key=True)
        station_name = models.CharField(verbose_name='车站名称', max_length=32)  # station name
        english = models.CharField(verbose_name='英文编码', max_length=32, db_index=True)  # English code, indexed
        spell = models.CharField(verbose_name='拼音', max_length=32)  # full pinyin
        spell_brief = models.CharField(verbose_name='拼音简', max_length=32)  # abbreviated pinyin
        city = models.ForeignKey(to='City', verbose_name='关联城市', db_index=True)  # city the station belongs to

        class Meta:
            verbose_name_plural = '车站表'

        def __str__(self):
            label = self.station_name
            return str(label)
    
    
    # 城市
    class City(models.Model):
        """A city that one or more stations belong to."""

        city_name = models.CharField(verbose_name='城市列表', max_length=32)  # city name

        class Meta:
            verbose_name_plural = '城市'

        def __str__(self):
            label = self.city_name
            return str(label)
    
    # 列车表
    class Train(models.Model):
        """A train service: its number, code, end stations, times and stops."""

        train_size = models.CharField(verbose_name='列车号', max_length=32, db_index=True)  # train number, indexed
        train_coding = models.CharField(verbose_name='列车编码', max_length=32)  # train code
        # Both foreign keys below target Station, so each needs its own
        # related_name to keep the reverse accessors from clashing.
        start_stand = models.ForeignKey(
            to='Station', verbose_name='起始站', related_name='related_start_stand',
        )  # origin station
        terminus = models.ForeignKey(
            to='Station', verbose_name='终点站', related_name='related_terminus',
        )  # terminal station
        depart = models.CharField(verbose_name='始发时间', max_length=32)  # departure time at origin
        arrive = models.CharField(verbose_name='到达时间', max_length=32)  # arrival time at terminus
        # NOTE(review): the default is the int 7 on a CharField; confirm whether '7' was intended.
        coach_num = models.CharField(verbose_name='车厢数', max_length=32, default=7)  # number of coaches
        # Stops are stored in the explicit through-table Station2Train.
        station = models.ManyToManyField(
            to='Station',
            verbose_name='关联列车进站时间表',
            through='Station2Train',
            through_fields=('train', 'station'),
        )

        class Meta:
            verbose_name_plural = '列车表'

        def __str__(self):
            label = self.train_size
            return str(label)
    
    # 列车进站时间表 车站——列车多对多表
    class Station2Train(models.Model):
        """One stop of a train at a station (through-table for Train <-> Station).

        Each (station, train) pair may appear only once, as enforced by
        ``unique_together``.
        """

        station = models.ForeignKey(verbose_name='关联车站表', to='Station')  # station of this stop
        train = models.ForeignKey(verbose_name='关联列车表', to='Train')  # train making this stop
        station_next = models.CharField('站次(这趟车第几次经过)', max_length=32)  # stop sequence number
        arrive_time = models.CharField('到达时间', max_length=32)  # arrival time
        depart_time = models.CharField('出发时间', max_length=32)  # departure time
        distance = models.CharField('和上一站的距离', max_length=32)  # distance from the previous stop
        is_state = models.CharField('是起终停', max_length=32)  # origin / terminus / intermediate marker

        def __str__(self):
            # Fix: the original concatenated the literal word 'station' here
            # instead of the related station, so every row looked the same.
            # Resolving self.station may cost one extra query per row.
            return str(self.station) + '的到达时间:' + str(self.arrive_time) + '  出发时间' + str(self.depart_time)

        class Meta:
            unique_together = ("station", "train")
            verbose_name_plural = '列车进站时间表'
    
    
    
    # 车座表
    class Seat(models.Model):
        """A single seat in one coach of a train, with its sale status."""

        # (value, label) pairs used as the choices for seat_type.
        choices = (
            (1, '商务座'), (2, '一等座'), (3, '二等座'), (4, '高级软卧'),
            (5, '高级硬卧'), (6, '硬座'), (7, '无座'),
        )

        train = models.ForeignKey(
            to='Train', verbose_name='关联列车表', null=True, db_index=True,
        )  # owning train
        coach_size = models.CharField(verbose_name='车厢号', max_length=32)  # coach number
        seat_type = models.IntegerField(verbose_name='座位类型', choices=choices)  # seat class
        seat_size = models.CharField(verbose_name='座位号', max_length=32)  # seat number
        is_sell = models.CharField(
            verbose_name='出售情况', max_length=64, null=True, db_index=True,
        )  # sale status, indexed

        class Meta:
            verbose_name_plural = '车座表'

        def __str__(self):
            kind = self.seat_type
            return str(kind)
    
    
    
    # 邮箱验证码
    class EmailVerifyRecord(models.Model):
        """An e-mail verification code sent for registration or password recovery."""

        code = models.CharField(verbose_name="验证码", max_length=20)  # verification code
        email = models.EmailField(verbose_name="邮箱", max_length=50)  # recipient address
        send_type = models.CharField(
            max_length=10,
            choices=(('register', "注册"), ('forget', "找回密码")),
        )  # purpose of the code
        send_time = models.DateTimeField(auto_now=True)  # auto_now: refreshed on every save

        class Meta:
            verbose_name = verbose_name_plural = "邮箱验证码"
    
    
    
    # 车票表
    class Ticket(models.Model):
        """A purchased ticket; the ticket number itself is the primary key."""

        ticket = models.CharField(verbose_name='车票号', max_length=32, primary_key=True)  # ticket number
        train_size = models.ForeignKey(to='Train', verbose_name='关联列车号')  # train the ticket is for
        coach_size = models.CharField(verbose_name='车厢号', max_length=32)  # coach number
        seat_size = models.CharField(verbose_name='座位号', max_length=32)  # seat number
        user = models.ForeignKey(to='UserInfo', verbose_name='关联用户表')  # buyer
        price = models.CharField(verbose_name='价格', max_length=32)  # price
        pay_type = models.CharField(verbose_name='支付方式', max_length=32)  # payment method
        depart_time = models.CharField(verbose_name='出发时间', max_length=32)  # departure time
        arrive_time = models.CharField(verbose_name='到达时间', max_length=32)  # arrival time
        depart_stand = models.CharField(verbose_name='出发站', max_length=32)  # boarding station
        arrive_stand = models.CharField(verbose_name='到达站', max_length=32)  # destination station
        buy_time = models.CharField(verbose_name='购买时间', max_length=32)  # purchase time
        is_quit = models.CharField(verbose_name='是否退票', max_length=32)  # refunded or not

        class Meta:
            verbose_name_plural = '车票表'

        def __str__(self):
            number = self.ticket
            return str(number)
    
    
    # 爬虫IP表
    class IP(models.Model):
        """A proxy address used by the crawler."""

        ip = models.CharField(verbose_name='IP地址', max_length=32)  # proxy host
        port = models.CharField(verbose_name='端口号', max_length=32)  # proxy port
        expire_time = models.CharField(verbose_name='过期时间', max_length=32)  # expiry time
        city = models.CharField(verbose_name='地区', max_length=32)  # region

        class Meta:
            verbose_name_plural = '爬虫IP表'

        def __str__(self):
            # Rendered as "<ip>:<port>".
            return '{}:{}'.format(self.ip, self.port)
    
    

    爬取所需代理ip ip.py

    (截至 2019-7-4,该爬取方式可能已经失效;建议先自行学习 requests 库)

    import requests
    
    from app01 import models
    
    
    class Get_IP():
        """Fetch a proxy from the Zhima proxy API and cache it in ``models.IP`` row id=1."""

        def __init__(self):
            pass

        def select_ip(self):
            """Return the cached proxy row (``models.IP`` with id=1), or ``None`` if it is missing."""
            return models.IP.objects.filter(id=1).first()

        def zhimaruanjian(self, url=None):
            """Request one proxy from the Zhima API and store it as row id=1.

            API used: http://webapi.http.zhimacangku.com/getip?

            :param url: full API URL; when falsy, the built-in URL is used
                (its ``pack`` parameter is a placeholder to be filled in).
            :return: ``True`` when the API answers with a non-zero ``code``;
                ``None`` after a proxy has been stored. The inverted-looking
                return value is kept for backward compatibility.
            :raises requests.exceptions.RequestException: on network failure
                or timeout.
            """
            if not url:
                url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&pack=自己的pack号&ts=1&ys=0&cs=1&lb=1&sb=0&pb=4&mr=1&regions='

            # A timeout keeps an unresponsive API from hanging the crawler forever.
            r = requests.get(url, timeout=10)
            res = r.json()
            if res['code'] != 0:
                return True

            proxy = res['data'][0]
            ip = proxy['ip']
            port = proxy['port']
            print('获取到新ip %s' % (str(ip) + ':' + str(port)))
            # Always write to id=1, the row select_ip() reads. The original
            # inserted a row with an auto-assigned id when id=1 was missing,
            # which could leave select_ip() returning None permanently.
            models.IP.objects.update_or_create(
                id=1,
                defaults={
                    'ip': ip,
                    'port': port,
                    'expire_time': proxy['expire_time'],
                    'city': proxy['city'],
                },
            )
    if __name__ == '__main__':
        # Manual run: fetch one proxy from the API and store it.
        res = Get_IP()
        res.zhimaruanjian()
    

    数据的定制爬取

    (截止2019-7-4,数据统计:爬取车站总数2863个,涉及城市1260个,车站停靠数86037个,自制座位数5244727条(列车数 X 每列车7节车厢 X 每车厢100座位))

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import os

    from django.core.wsgi import get_wsgi_application

    # Point Django at the project settings, unless the variable is already set.
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cp12306.settings")

    # Initialise Django before the app models are imported below, so this
    # standalone script can use the ORM outside of manage.py.
    application = get_wsgi_application()

    from app01 import models
    import xlrd
    import json
    import time
    import requests
    import random
    from lxml import etree
    from django.db.models import Count
    from app01.myfile.ip import Get_IP
    from concurrent.futures import ThreadPoolExecutor  # thread pool executor
    # Rebinds the class name to a single shared instance: from here on
    # ``Get_IP`` is an object, not the class.
    Get_IP = Get_IP()
    """
    用到的知识点:
    1) bulk_create 批量添加数据
    2) xlrd 操作Excel表
    3)  list(set(city_name_list)) 列表 利用set的自动去重功能
    4) 分组和聚合函数使用:
            train_list = models.Train.objects.annotate(a = Count('station2train__train')).values('train_size','a')
    
    """
    
    
    # 添加城市
    def city():
        """Insert the distinct city names from the station spreadsheet into ``City``.

        Reads column 5 of the first sheet of ``火车站信息表.xlsx``, removes
        duplicates and bulk-inserts one ``City`` per name with ids 1..N.
        Prints whether the rows were inserted or already existed.
        """
        from django.db import IntegrityError

        workbook = xlrd.open_workbook('火车站信息表.xlsx')  # open the Excel workbook
        city_names = workbook.sheets()[0].col_values(5)  # city-name column
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the ids are reproducible between runs (set() ordering is not).
        unique_names = dict.fromkeys(city_names)
        query_list = [
            models.City(id=pk, city_name=city_name)
            for pk, city_name in enumerate(unique_names, 1)
        ]
        try:
            models.City.objects.bulk_create(query_list)
        except IntegrityError:
            # Primary keys clash: the table was populated by an earlier run.
            print('城市列表已经存在!')
        else:
            # Only report success after the insert really succeeded.
            print('城市列表添加完成!')
    
    
    # 添加车站
    def station():
        """Insert every row of the station spreadsheet into ``Station``.

        Each row of the first sheet of ``火车站信息表.xlsx`` becomes one
        station whose id is the 1-based row number. Columns 1-4 supply the
        name, English code, pinyin and abbreviated pinyin; column 5 is the
        city name, resolved to a ``City`` id (``None`` when unknown).
        Prints whether the rows were inserted or already existed.
        """
        from django.db import IntegrityError

        # Map city name -> city id.
        city_dic = dict(models.City.objects.values_list('city_name', 'id'))

        sheet = xlrd.open_workbook('火车站信息表.xlsx').sheets()[0]  # open the Excel sheet
        query_list = []
        for row_index in range(sheet.nrows):
            row = sheet.row_values(row_index)  # one spreadsheet row
            query_list.append(models.Station(
                id=row_index + 1,
                station_name=row[1],
                english=row[2],
                spell=row[3],
                spell_brief=row[4],
                city_id=city_dic.get(row[5]),
            ))

        try:
            models.Station.objects.bulk_create(query_list)
        except IntegrityError:
            # Primary keys clash: the table was populated by an earlier run.
            print('车站列表已经存在!')
        else:
            print('车站列表添加完成!')
    
    #
    # # 添加座位类型
    # def seat_type():
    #     seat_list = ['商务座', '一等座', '二等座', '高级软卧', '高级硬卧', '硬座', '无座']
    #     query_list = []
    #     for id, seat_type in enumerate(seat_list, 1):
    #         query_list.append(models.Seat_Type(id=id, seat_type=seat_type))
    #
    #     # 批量插入数据库之bulk_create()
    #     try:
    #         models.Seat_Type.objects.bulk_create(query_list)
    #         print('座位类型添加完成!')
    #     except:
    #         print('座位类型已经存在!')
    
    # 爬取列车数据
    def pa(station_dic, train_size):
        """Scrape the timetable page of one train.

        Data collected for the train: origin station, terminal station,
        departure time, arrival time and, for each stop, its sequence number,
        station, arrival time, departure time, distance and state.

        :param station_dic: mapping of station name -> station id.
        :param train_size: train number, appended to the timetable URL.
        :return: ``False`` when the page cannot be fetched or parsed, or when
            it names a station missing from ``station_dic``; otherwise a dict
            ``{'start_stand': id, 'terminus': id, 'depart': str,
            'arrive': str, 'data': {station_next: [station_id, arrive_time,
            depart_time, distance, is_state]}}``.
        """
        url = 'http://checi.114piaowu.com/{}'.format(train_size)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate',
            'Cookie': 'CLIENT_SOURCE=baidu_www.baidu.com; CLIENT_FIRST_ENTER=pc_shike; tostation=%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82; UM_distinctid=16bb7da72431fe-07a5e239d1d8e6-37677e02-1aeaa0-16bb7da7244977; JSESSIONID=DF0894D3C3B6127C656BF6ADF714674E; fromstation=%E9%98%BF%E5%B0%94%E5%B1%B1; CLIENT_LAST_ENTER=pc_checi',
            'Host': 'checi.114piaowu.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

        def get_ip():
            """Build the requests ``proxies`` dict from the cached proxy row."""
            ip_obj = Get_IP.select_ip()
            ip = ip_obj.ip + ":" + ip_obj.port
            proxies = {
                'http': ip,
                'https': ip
            }
            print(proxies)
            return proxies

        def get_url():
            """Fetch the timetable page through the currently cached proxy."""
            r = requests.get(url=url, headers=headers, proxies=get_ip(), timeout=10)
            print(r.status_code)
            return r

        try:
            r = get_url()
        # Fix: the original wrote `except ConnectTimeout or ProxyError`, which
        # evaluates to ConnectTimeout alone; a tuple is needed to catch both.
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ProxyError):
            print('代理ip无效')  # proxy is dead
            Get_IP.zhimaruanjian()
            return False
        except requests.exceptions.ReadTimeout:
            print('读取超时')  # read timed out
            return False
        except requests.exceptions.TooManyRedirects:
            print('ip被限制')  # the current proxy has been rate-limited
            r = None
            # Switch to a new proxy and retry; one more attempt is allowed
            # when the new proxy is itself dead.
            for _attempt in range(2):
                try:
                    Get_IP.zhimaruanjian()
                    r = get_url()
                    break
                except requests.exceptions.ProxyError:
                    continue
                except Exception:
                    break
            if r is None:
                print('未知错误!')
                return False
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C still stops the crawl.
            print('未知错误!')
            Get_IP.zhimaruanjian()
            return False

        # Parse the fetched page.
        selector = etree.HTML(r.text)
        func_dic = {}

        # Header data: end stations and their times.
        try:
            endpoints = selector.xpath("//dd[@class='line']/ul/li/a/text()")
            func_dic['start_stand'] = station_dic[endpoints[0]]  # origin station
            func_dic['terminus'] = station_dic[endpoints[1]]  # terminal station
            times = selector.xpath("//dd[@class='line']/ul/li/text()")
            func_dic['depart'] = times[0]  # departure time
            func_dic['arrive'] = times[1].split('(')[0]  # arrival time
            rows = selector.xpath("//div[@class='list']/table//tr")
            func_dic['data'] = {}
        except (LookupError, AttributeError):
            # Unexpected page layout, unknown station, or an unparsable page.
            return False

        # One entry per stop: func_dic['data'][station_next] = [...]
        for each in rows:
            numbers = each.xpath("./td[5]/text()")
            if not numbers:
                continue  # rows without a distance cell are not stop rows
            numbers = numbers[0]
            try:
                station_next = each.xpath("./td[1]/text()")[0]  # stop sequence number
                station = station_dic[each.xpath("./td[2]/a/text()")[0]]  # station id
                arrive_time = each.xpath("./td[3]/text()")[0]  # arrival time
                depart_time = each.xpath("./td[4]/text()")[0]  # departure time
                if numbers == '--':
                    numbers = 0
                distance = int(numbers)  # distance value as shown on the page
            except (KeyError, IndexError, ValueError):
                # Unknown station or malformed row: give up on this train
                # instead of crashing the whole crawl.
                return False
            if station == func_dic['terminus']:
                is_state = '终'  # terminus
            elif arrive_time == '--':
                is_state = '起'  # origin (no arrival time)
            else:
                is_state = '暂'  # intermediate stop
            func_dic['data'][station_next] = [station, arrive_time, depart_time, distance, is_state]
        return func_dic
    
    # 保存车列表
    def train_save(dic):
        """Store one scraped train and all of its stops.

        :param dic: dict with ``train_size``, ``train_coding``,
            ``start_stand`` and ``terminus`` (station ids), ``depart``,
            ``arrive``, and ``data`` mapping stop sequence number ->
            ``[station_id, arrive_time, depart_time, distance, is_state]``.

        The train and its stops are written in one transaction. Without it, a
        failure part-way through left a Train row with only some stops, and a
        later existence check on ``train_size`` would skip that train.
        """
        from django.db import transaction

        train_size = dic.get('train_size')  # train number
        with transaction.atomic():
            # Station ids are assigned directly via the *_id attributes,
            # saving one Station lookup per foreign key.
            train_obj = models.Train.objects.create(
                train_size=train_size,
                train_coding=dic.get('train_coding'),  # train code
                start_stand_id=dic.get('start_stand'),  # origin station
                terminus_id=dic.get('terminus'),  # terminal station
                depart=dic.get('depart'),  # departure time
                arrive=dic.get('arrive'),  # arrival time
            )
            stops = [
                models.Station2Train(
                    station_next=station_next,  # stop sequence number
                    station_id=val[0],  # station of this stop
                    train=train_obj,  # owning train
                    arrive_time=val[1],
                    depart_time=val[2],
                    distance=val[3],  # distance value
                    is_state=val[4],  # origin / terminus / intermediate marker
                )
                for station_next, val in dic.get('data').items()
            ]
            models.Station2Train.objects.bulk_create(stops)
        print('{}次列车信息存入成功!'.format(train_size))
    
    
    
    # 获取车表
    def train(path='train_list.js', date='2019-07-16'):
        """Crawl and store every train listed for one date in the train-list file.

        :param path: JSON file mapping date -> {category: [train entries]};
            each entry has ``station_train_code`` and ``train_no``.
        :param date: key of the day whose trains should be crawled.
        :raises ValueError: when ``date`` has no entries in the file.

        Trains already in the database are skipped. Trains whose page cannot
        be scraped are collected and reported at the end.
        """
        info = []  # trains that were already stored
        errors = []  # trains that could not be scraped
        # Map station name -> station id.
        station_dic = dict(models.Station.objects.values_list('station_name', 'id'))

        # Load the train list for the requested date.
        with open(path, 'rb') as f:
            data = json.loads(f.read()).get(date)
        if not data:
            # The original failed here with an opaque AttributeError on None.
            raise ValueError('no train list for date {!r} in {}'.format(date, path))

        # Flatten to {train number: train code}; the "(...)" suffix is dropped.
        data_dic = {}
        for group in data.values():
            for entry in group:
                train_number = entry.get('station_train_code').split('(')[0]
                data_dic[train_number] = entry.get('train_no')

        for i, (train_number, train_no) in enumerate(data_dic.items(), 1):
            print('正在爬第{}趟{}列车..'.format(i, train_number))
            # Skip trains that are already stored.
            if models.Train.objects.filter(train_size=train_number).exists():
                print('正在爬第{}趟{}列车数据已存在!'.format(i, train_number))
                info.append(train_number)
                continue

            res_dic = pa(station_dic, train_number)
            if not res_dic:
                # Nothing was stored for this train, so there is nothing to
                # clean up; the original issued a no-op delete() here.
                print('正在爬第{}趟{}列车数据报错!'.format(i, train_number))
                errors.append(train_number)
                continue

            res_dic['train_size'] = train_number
            res_dic['train_coding'] = train_no
            train_save(res_dic)

        print('已存在列车数据:', info)
        print('不存在列车数据:', errors)
    
    # 添加座位表
    def seat():
        """Generate 7 coaches x 100 seats for every train.

        Seat ids run consecutively across all trains, starting at 1. Coach
        ``n`` gets ``seat_type`` ``n``. ``is_sell`` starts as a string of
        '1' characters, one per stop of the train.
        """
        from django.db import IntegrityError

        # One query: every train annotated with its number of stops.
        trains = models.Train.objects.annotate(stop_count=Count('station2train__train'))
        seat_id = 0  # running primary key (the original shadowed the builtin `id`)
        for index, train in enumerate(trains, 1):
            train_size = train.train_size
            print(index, train_size, train.stop_count)
            sell = '1' * train.stop_count
            print(sell)
            query_list = []
            # 7 coaches
            for coach_size in range(1, 8):
                # 100 seats per coach
                for seat_size in range(1, 101):
                    seat_id += 1
                    query_list.append(models.Seat(
                        id=seat_id, train=train, coach_size=coach_size,
                        seat_type=coach_size, seat_size=seat_size, is_sell=sell,
                    ))
            try:
                models.Seat.objects.bulk_create(query_list)
            except IntegrityError:
                # Primary keys clash: these seats were created by an earlier run.
                print('座位列表{}已经存在!'.format(seat_id))
            else:
                print('座位列表添加成功{}条!'.format(seat_id))
    
    def get_ip():
        """Load all proxy addresses from the remote ``ip`` table.

        :return: list with the ``ip`` column of every row.
        """
        import pymysql

        # NOTE(review): credentials are hard-coded in source; move them to
        # settings or environment variables and rotate this password.
        host = '106.75.31.89'
        user = 'root'
        password = 'Aa428912'
        database = 'Ip_conn'
        port = 3306

        # Keyword arguments: positional connection arguments are rejected by
        # PyMySQL 1.0 and later.
        connect = pymysql.connect(host=host, user=user, password=password,
                                  database=database, port=port, charset='utf8')
        try:
            with connect.cursor(pymysql.cursors.DictCursor) as cursor:
                cursor.execute('select ip from ip')
                rows = cursor.fetchall()
        finally:
            # The original never closed the connection.
            connect.close()

        data_list = [row['ip'] for row in rows]
        print(data_list)
        print(len(data_list))

        return data_list
    
    # 爬取距离
    def pa_distance(station_name, station_name_1, ip):
        """Scrape the distance between two stations.

        :param station_name: name of the first station.
        :param station_name_1: name of the second station.
        :param ip: proxy address, used for both http and https.
        :return: text of the first ``<h3>`` on the page up to '公里'.
        :raises IndexError: when the page contains no ``<h3>`` text.
        """
        print(ip)
        proxies = {
            "http": "http://{}".format(ip),
            "https": "http://{}".format(ip),
        }

        url = 'http://juli.liecheshike.com/从{}到{}有多远'.format(station_name, station_name_1)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate',
            'Cookie': 'safedog-flow-item=9C74D71A66F8B17A458732499BAEC7FF; ASPSESSIONIDCADCQTDB=JAMEOIKBLEPLECFDNHNCADNH; __51cke__=; ASPSESSIONIDCABBRQAC=KAGHCJNAIICFAABHNEOKPLIA; __tins__1516098=%7B%22sid%22%3A%201562545736853%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201562547696493%7D; __51laig__=14',
            'Host': 'juli.liecheshike.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        # Fix: `proxies` used to be passed to str.format() while building the
        # URL, where it was silently ignored, so the request never went
        # through the proxy. It belongs on requests.get().
        r = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
        selector = etree.HTML(r.text)
        res = selector.xpath("//h3/text()")
        return res[0].split('公里')[0]
    
    
    # 距离
    def get_distance(data, start_id=45):
        """Scrape and store the cumulative distance of every stop of every train.

        Train ids from ``start_id`` up to the number of trains are processed:
        for each train, the distance between consecutive stops is scraped and
        the running total is written to ``Station2Train.distance``. The first
        stop gets '0'.

        :param data: non-empty sequence of proxy addresses; one is picked at
            random for each request.
        :param start_id: first train id to process. The default 45 is the
            value that was hard-coded originally.
        """
        def get(i, count=0):
            """Update the distances of all stops of the train with id ``i``."""
            stops = models.Station2Train.objects.filter(train_id=i).values_list('station__station_name', 'pk')
            previous_name = ""
            distances = 0  # running total from the first stop
            print('——————第{}站——————'.format(i))
            for index, stop in enumerate(stops, 1):
                count += 1
                print('第', i, '的', count, '个')
                current_name = stop[0]
                if index > 1:
                    distance = pa_distance(previous_name, current_name, ip=random.choice(data))
                    distances += int(distance)
                    print(distances)
                    models.Station2Train.objects.filter(train_id=i, station_next=index).update(distance=distances)
                else:
                    models.Station2Train.objects.filter(train_id=i, station_next=index).update(distance='0')
                previous_name = current_name

        started = time.time()
        sumber = models.Train.objects.all().count()
        # The with-block waits for every task to finish, so the elapsed time
        # printed below covers the real work and not just the submit calls.
        with ThreadPoolExecutor(max_workers=1) as pool:
            futures = {pool.submit(get, i): i for i in range(start_id, sumber + 1)}
        # Exceptions raised in worker threads were silently lost; report them.
        for future, train_id in futures.items():
            error = future.exception()
            if error is not None:
                print('train_id {} failed: {!r}'.format(train_id, error))

        print(sumber)
        print(time.time() - started)
    if __name__ == '__main__':
        pass
        # Fetch and cache a proxy before any crawling starts.
        Get_IP.zhimaruanjian()

        city()  # add cities
        station()  # add stations
        train()  # crawl the train timetables
        seat()  # add seats
        # Load the proxy list, then scrape the distances between stops.
        data = get_ip()
        get_distance(data)
    
    
    

    额外文件 (博客园无法上传大文件,给个外链接)

    文件列表:
    	火车站信息表.xlsx
      train_list.js
    下载地址:
    		小强云盘分享链接:http://www.liqianglog.top:8002/home/share_link/K6X8028O08 提取密码为:353C 点击分享快去分享给好友啵~~
    

    (如果失效,请联系博主,1206709430@qq.com)

  • 相关阅读:
    Linux下搭建DNS服务器
    Linux下安装Oracle客户端
    CentOS下配置LVM和RAID
    Linux下配置MySQL主从复制
    Linux下二进制文件安装MySQL
    不偏移的天地图地图服务
    Lucene
    Arcpy处理修改shapefile FeatureClass 线要素坐标
    使用ArcGIS实现WGS84经纬度坐标到北京54高斯投影坐标的转换
    ArcGIS自定义坐标变换中的方法说明
  • 原文地址:https://www.cnblogs.com/liqianglog/p/11134687.html
Copyright © 2011-2022 走看看