zoukankan      html  css  js  c++  java
  • 爬取酒店信息

    import requests
    from lxml import etree
    import re
    import xlwt
    import pyodbc
    import random
    
    class Hotel():
        """Scraper for hotel listings on www.gckzw.com.

        An instance owns three resources created in ``__init__``:

        * ``self.headers`` - a pool of User-Agent strings used by ``get_html``;
        * ``self.f`` / ``self.sheet1`` - an in-memory xlwt workbook and its sheet;
        * ``self.cnxn`` / ``self.cursor`` - a pyodbc connection and cursor used
          by ``sql_connect``.
        """

        # Connection string used when the caller does not supply one.
        # NOTE(review): credentials are hard-coded in source; prefer passing a
        # connection string built from configuration or the environment.
        DEFAULT_CONNECTION_STRING = 'DRIVER={SQL Server};SERVER=.;DATABASE=Test;UID=sa;PWD=123456'

        def __init__(self, connection_string=None):
            """Create the User-Agent pool, the workbook and the DB connection.

            :param connection_string: optional pyodbc connection string; when
                omitted, ``DEFAULT_CONNECTION_STRING`` is used.
            """
            # Pool of User-Agent strings; get_html() picks one at random.
            # NOTE(review): a few entries look malformed (the
            # "MSIE 9.0; Windows NT 6.1; Trident/5.0" entry has no closing
            # parenthesis, one says "Androdi", the last ends in "Name").
            # They are kept unchanged here - confirm before correcting.
            self.headers = [
                "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
                "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
                "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
                "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
                "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
                "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
                'UCWEB7.0.2.37/28/999',
                'NOKIA5700/ UCWEB7.0.2.37/28/999',
                'Openwave/ UCWEB7.0.2.37/28/999',
                'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; 360SE)',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
                'Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
                'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)',
                'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
                'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
                'Mozilla/5.0 (Androdi; Linux armv7l; rv:5.0) Gecko/ Firefox/5.0 fennec/5.0',
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3192.0 Safari/537.36Name"]

            # In-memory Excel workbook with a single sheet; cells may be
            # overwritten without xlwt raising.
            self.f = xlwt.Workbook(encoding='utf-8')
            self.sheet1 = self.f.add_sheet(u'sheet1', cell_overwrite_ok=True)

            # Database connection and the cursor used for inserts.
            if connection_string is None:
                connection_string = self.DEFAULT_CONNECTION_STRING
            self.cnxn = pyodbc.connect(connection_string)
            self.cursor = self.cnxn.cursor()

        def get_html(self, jingji_url, retries=3, timeout=10):
            """Download a page with a random User-Agent and return its text.

            :param jingji_url: URL to fetch.
            :param retries: maximum number of attempts (the original behaviour
                was a fixed three attempts).
            :param timeout: per-request timeout in seconds, so a stalled server
                cannot hang the scraper indefinitely.
            :return: the response body decoded as UTF-8, or ``None`` if no
                attempt produced an HTTP 200 response.
            """
            for _ in range(retries):
                request_headers = {'User-Agent': random.choice(self.headers)}
                try:
                    r = requests.get(jingji_url, headers=request_headers, timeout=timeout)
                except requests.RequestException as exc:
                    # Network-level failure: report it and try again.
                    print("request failed", jingji_url, exc)
                    continue
                if r.status_code == 200:
                    r.encoding = 'utf-8'
                    return r.text
            return None

        def parse(self, html):
            """Parse a list page into ``[{hotel_name: absolute_detail_url}, ...]``.

            :param html: HTML text of a list page (may be ``None`` or empty
                when the download failed).
            :return: a list with one single-entry dict per hotel; an empty list
                when the page is missing or cannot be parsed, so callers can
                always iterate the result.
            """
            if not html:
                return []

            url_base = 'http://www.gckzw.com'
            try:
                tree = etree.HTML(html)
            except (etree.LxmlError, ValueError, TypeError) as exc:
                print("list page could not be parsed", exc)
                return []
            if tree is None:
                return []

            # Relative URL of every hotel on the page.
            content_url = tree.xpath('//div[@class="travel_left_content travel_left"]//div[@class="travel_hotel_list_content travel_celarfix"]//p[@class="travel_hotel_intro_title"]//a/@href')
            # Display name of every hotel on the page.
            content_name = tree.xpath('//div[@class="travel_left_content travel_left"]//div[@class="travel_hotel_list_content travel_celarfix"]//p[@class="travel_hotel_intro_title"]//a/text()')

            # Names and URLs are paired by position; zip() stops at the shorter
            # list instead of failing when the two queries disagree in length.
            return [{name: url_base + href} for name, href in zip(content_name, content_url)]

        def detail_parse(self, html):
            """Extract contact information and location from a hotel detail page.

            :param html: HTML text of a detail page (may be ``None`` or empty).
            :return: ``[contact, location]``, or ``None`` when the page is
                missing, cannot be parsed, or lacks either field.
            """
            if not html:
                return None

            try:
                tree = etree.HTML(html)
            except (etree.LxmlError, ValueError, TypeError) as exc:
                print("detail page could not be parsed", exc)
                return None
            if tree is None:
                return None

            # Contact information.
            contact_nodes = tree.xpath('//div[@class="em2_bg clearfix"][1]//p/text()')
            # Location.
            location_nodes = tree.xpath("//div[@class='hotel_comment_header travel_celarfix'][1]//p[2]/span/text()")
            if not contact_nodes or not location_nodes:
                print("detail page is missing contact or location")
                return None

            list_end = [contact_nodes[0], location_nodes[0]]
            print("联系方式:和位置",list_end)
            return list_end

        def url_list(self, first_page=1, last_page=159,
                     start_date='2019-07-24', end_date='2019-07-25'):
            """Build the URLs of the list pages to crawl.

            The defaults reproduce the original fixed crawl: pages 1..159 for
            the stay 2019-07-24 to 2019-07-25.

            :param first_page: first page number (inclusive).
            :param last_page: last page number (inclusive).
            :param start_date: value of the ``startDate`` query parameter.
            :param end_date: value of the ``endDate`` query parameter.
            :return: list of list-page URLs in ascending page order.
            """
            template = 'http://www.gckzw.com/jiudian-xian610100-p{}.html?startDate={}&endDate={}'
            return [template.format(page, start_date, end_date)
                    for page in range(first_page, last_page + 1)]

        def sql_connect(self, list_end):
            """Insert one hotel record into table ``jiudian_2`` and commit.

            :param list_end: ``[contact, location, name]`` - the two values
                returned by ``detail_parse`` followed by the hotel name, which
                is how the script's main loop builds the record.

            The values are bound to the columns they describe
            (``name`` <- name, ``phone`` <- contact, ``adress`` <- location).
            Failures are reported and swallowed so one bad record does not
            stop the crawl.
            """
            if not list_end or len(list_end) < 3:
                print("record skipped, expected [contact, location, name]", list_end)
                return

            contact, location, name = list_end[0], list_end[1], list_end[2]
            try:
                print("**********{} **{}***{}****{}*************".format(self.cursor,list_end[0],list_end[1],list_end[2]))

                # Parameterized insert; "adress" is the column name as spelled
                # in the original statement.
                insert = self.cursor.execute("insert into jiudian_2(name,phone,adress) values (?,?,?)",(name, contact, location)).rowcount
                print("insert",insert)
                self.cursor.commit()
            except Exception as exc:
                # Best-effort per-record boundary: report instead of hiding.
                print("insert failed", exc)
    
    if __name__ == '__main__':
        # Crawl every list page, visit each hotel's detail page, store the
        # record in the database and mirror it into the Excel sheet.
        news = Hotel()
        list_page_list = news.url_list()
        print("所有列表页面的url",list_page_list)

        # Target of the workbook save; the path is the one named in the
        # previously disabled save call.
        excel_path = r'd:excel_jingjijiudian.xls'

        # Excel row counter. It lives outside the page loop so rows from later
        # pages are appended instead of overwriting rows from earlier pages.
        num = 0

        try:
            # Walk every list page.
            for url in list_page_list:
                html = news.get_html(url)
                detail_url_list = news.parse(html)
                if not detail_url_list:
                    # Download or parsing failed, or the page lists no hotels.
                    continue

                for deta_url in detail_url_list:
                    # Each entry is a single-item dict {hotel name: detail URL}.
                    name, detail_url = next(iter(deta_url.items()))

                    # Fetch and parse the hotel's detail page.
                    html = news.get_html(detail_url)
                    list_end = news.detail_parse(html)
                    if not list_end:
                        # No usable details for this hotel; skip it rather
                        # than abort the whole crawl.
                        continue
                    list_end.append(name)
                    print("需要的信息",list_end)

                    # Store in the database.
                    news.sql_connect(list_end)

                    # Mirror the record into the sheet: (row, column, value).
                    for j, value in enumerate(list_end):
                        news.sheet1.write(num, j, value)
                    num += 1
        except Exception as e:
            print("错误原因",e)
        finally:
            try:
                # Persist the collected rows; the workbook (not the database
                # connection) is the object that provides save().
                news.f.save(excel_path)
            except OSError as e:
                print("could not save workbook", excel_path, e)
            finally:
                news.cnxn.close()
  • 相关阅读:
    算法图解之散列表
    算法图解之快速排序
    算法图解之分而治之
    __setitem__,__getitem__,__delitem__的作用
    算法图解之递归
    Python开发不可不知的虚拟环境
    静态属性property的本质和应用
    SQLmap详解
    windows提权备忘录
    linux提权备忘录
  • 原文地址:https://www.cnblogs.com/yuanjia8888/p/11361648.html
Copyright © 2011-2022 走看看