  • First crawler mini-project, 2019.6.1

    Target URL: http://xuexiao.51sxue.com/slist/?o=&t=3&areaCodeS=&level=&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey=

    The task is to crawl the data on the junior-high and senior-high school listing pages.

    The fields collected are the school name, the district, and the school introduction.

    The district is then matched against the district table (mjp_district_area in the code below): if the district is not in the table, 0 is stored; otherwise the matching id is stored with the school row in the mjp_school table. The code follows:

    import random
    import time

    import pymysql
    import requests
    from lxml import etree

    # User-Agent strings rotated across requests so the crawler looks less like a single client
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36',
    ]

    db = pymysql.connect(host='192.168.1.200', port=3306, user='app_mjp', password='app_mjp', db='app_mjp')
    cursor = db.cursor()

    def get_url():
        """Walk every listing page, collect each school's fields and insert them into mjp_school."""
        numbo = 0
        for num in range(1, 1334):
            numbo += 1
            url = "http://xuexiao.51sxue.com/slist/?o=&t=2&areaCodeS=&level=&sp=&score=&order=&areaS=%C8%AB%B9%FA&searchKey=" + "&page=" + str(num)
            response = requests.get(url=url, headers={"User-Agent": random.choice(USER_AGENTS)}).content
            tree = etree.HTML(response)
            # School name, detail-page link and district text for every school on the listing page
            title = tree.xpath('//div[@class="school_m_main fl"]/li/h3//text()')
            new_url = tree.xpath('//div[@class="school_m_main fl"]/li/h3//@href')
            diqu = tree.xpath('//div[@class="school_m_main fl"]/li[2]/b//text()')
            xuexiaojieshao = get_jianjie(new_url)
    
            # All fields for this page are now available
            data = zip(title, diqu, xuexiaojieshao)

            for i in data:
                # Look up the district id; the scraped text carries a leading space, so strip it
                sql = 'select da_id from mjp_district_area where da_name="{}"'.format(i[1][-4:].strip())
                cursor.execute(sql)
                # Commit so the connection sees the latest data in the district table
                db.commit()
                rs = cursor.fetchone()  # a single row such as (da_id,), or None if the district is unknown
                # District not found -> 0, otherwise keep the numeric id
                da_id = rs[0] if rs else 0
                print(i[0], da_id, i[2], 2)
                sql_insert = "INSERT INTO mjp_school(s_name,s_da_id,s_type,s_desc,s_creat_time) VALUES(%s,%s,%s,%s,%s)"
                try:
                    cursor.execute(sql_insert, (i[0], da_id, 2, i[2], time.time()))
                    db.commit()
                except Exception as ss:
                    print(ss)
                    # Roll back on error so the connection stays usable
                    db.rollback()
            print("Page %s crawled" % numbo)
    
    def get_jianjie(new_url):
        """Fetch each school's detail page and return the introduction text for every URL, in order."""
        lst = []
        for u in new_url:
            # The introduction text lives on the "content" page rather than the "detail" page
            url = u.replace("detail", "content")
            res = requests.get(url=url, headers={"User-Agent": random.choice(USER_AGENTS)})
            res.encoding = "gbk"  # the site serves GBK-encoded pages
            tree = etree.HTML(res.text)
            # Join the paragraph texts, dropping ideographic and non-breaking spaces
            p = "".join(
                [i.replace("\u3000\u3000", "").strip() for i in tree.xpath('//*[@id="nr_main"]/div[1]/div[2]/p//text()')]).replace("\xa0 \xa0 \xa0", "")
            lst.append(p)
        return lst
    
    
    if __name__ == '__main__':
        get_url()
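
    A side note on the district lookup: the SELECT above is assembled with str.format, so a district name containing a double quote would break the query. Here is a small sketch of the same lookup as a parameterized query, assuming the same mjp_district_area table and the cursor from above; lookup_da_id is a name of my own, not part of the original script.

    def lookup_da_id(cursor, district_name):
        # Strip stray whitespace before matching; the scraped text can carry a leading space
        name = district_name.strip()
        # Let pymysql handle the quoting instead of str.format
        cursor.execute("SELECT da_id FROM mjp_district_area WHERE da_name = %s", (name,))
        row = cursor.fetchone()
        # Unknown district -> 0, otherwise return the numeric id
        return row[0] if row else 0

    It would drop in where get_url() builds sql by hand: da_id = lookup_da_id(cursor, i[1][-4:]).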

    Takeaway: a very simple little project that still took a whole day.

    Specific difficulties: 1. Two tables had to be linked, and I worried that would cause problems.

          2. The scraped district text had a leading space, so the district lookup returned None until the value was stripped.

          3. I have not used multithreading yet; practising it is next on the list (see the sketch after this list).
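
    For point 3, here is a minimal sketch of how the per-school detail requests could be fanned out over a thread pool with concurrent.futures from the standard library. It reuses requests, random, etree and the USER_AGENTS list from the script above; fetch_intro and get_jianjie_threaded are names of my own and the sketch has not been run against the site.

    from concurrent.futures import ThreadPoolExecutor

    def fetch_intro(u):
        # Same per-URL logic as the loop body in get_jianjie
        res = requests.get(u.replace("detail", "content"),
                           headers={"User-Agent": random.choice(USER_AGENTS)})
        res.encoding = "gbk"
        tree = etree.HTML(res.text)
        return "".join(t.strip() for t in tree.xpath('//*[@id="nr_main"]/div[1]/div[2]/p//text()'))

    def get_jianjie_threaded(new_url, workers=8):
        # pool.map preserves the input order, so the results still line up with title and diqu in zip()
        with ThreadPoolExecutor(max_workers=workers) as pool:
            return list(pool.map(fetch_intro, new_url))

    Swapping get_jianjie(new_url) for get_jianjie_threaded(new_url) inside get_url() would be enough to try it, although the shared pymysql connection should stay in the main thread.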

  • Original article: https://www.cnblogs.com/daien522556/p/10968368.html