  • Python -- Crawler Basics

    1.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 10:30:26 2019

    @author: Office
    """
    import urllib.request

    #the site to crawl
    url = "http://www.baidu.com/"

    #response: sends a request to the given url and returns the http response data (a file-like object)
    response = urllib.request.urlopen(url)

    #read the content as bytes
    data = response.read()  #reads the whole body; the result is one bytes string
    #data = response.readline()   #reads a single line; printing everything needs a loop
    #data = response.readlines()  #reads the whole body into a list of lines
    #print(data)
    #print(type(data))

    #decode the fetched bytes into a string
    str_data = data.decode("utf-8")
    #print(str_data)
    #print(type(str_data))

    #write the crawled page to a file
    #method 1
    with open("baidu.html","w",encoding="utf-8") as f:  #write it as str
        f.write(str_data)

    #method 2: urlretrieve leaves some cache behind while it runs, which needs to be cleared
    #urllib.request.urlretrieve(url,"baidu2.html")
    #urllib.request.urlcleanup()  #clear the cache; urlcleanup() takes no arguments

    #response attributes
    #print(response.info())     #information about the response environment (headers)
    #print(response.getcode())  #status code; remember 200 (OK), 304 (not modified, i.e. served from cache), 400 (bad request, e.g. malformed syntax), 500 (internal server error)
    #print(response.geturl())   #the URL currently being crawled

    2.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 15:09:34 2019

    @author: Office
    """
    import urllib.request
    url = "http://www.baidu.com/"
    #fake a request header
    headers={
    "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    #build a Request object
    req=urllib.request.Request(url,headers=headers)

    #send the request
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)

    3.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 15:17:49 2019

    @author: Office
    """

    import urllib.request
    import random
    url = "http://www.baidu.com/"
    #pool of fake User-Agent strings
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    #pick a User-Agent at random
    agentStr=random.choice(agentlist)
    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent':agentStr,
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }

    #build a Request object
    req=urllib.request.Request(url,headers=headers)

    #send the request
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    #print(data)
    print(req.get_full_url())            #the full url
    print(req.get_header('User-agent'))  #the User-Agent; in the key, only the first word's initial is uppercase, later words are lowercase

    #second way: add the User-Agent after building the Request
    url = "http://www.baidu.com/"
    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }

    user_agent_list=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_agent=random.choice(user_agent_list)

    req=urllib.request.Request(url,headers=headers)
    req.add_header('User-Agent',end_user_agent)

    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)

    4.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 16:10:42 2019

    @author: Office
    """
    import urllib.request
    url = "http://www.baidu.com/"
    #if the page does not respond for too long, the request times out and the crawl fails
    for i in range(1,100):
        try:
            response=urllib.request.urlopen(url,timeout=0.2)
            print(len(response.read().decode('utf-8')))
        except:
            print("request timed out, moving on to the next one")

    5.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 16:24:45 2019

    @author: Office
    """
    #http usage: for passing messages between client and server
    #GET: passes information through the url itself; the data to send can be appended straight onto the url
    #POST: submits data to the server; a popular and relatively private way to pass data
    #PUT: asks the server to store a resource, usually at a specified location
    #DELETE: asks the server to delete a resource
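
    #urllib can issue PUT/DELETE too, via the method= argument of Request.
    #A minimal commented sketch (httpbin.org is used purely as a hypothetical test endpoint):
    #put_req=urllib.request.Request('http://httpbin.org/put',data=b'k=v',method='PUT')
    #print(urllib.request.urlopen(put_req).getcode())
    #del_req=urllib.request.Request('http://httpbin.org/delete',method='DELETE')
    #print(urllib.request.urlopen(del_req).getcode())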


    '''
    GET request
    How it works: the data is appended to the request path and passed to the server

    Pros: fast

    Cons: small payload, not private

    '''
    import urllib.request
    import urllib.parse
    import string
    import random

    #single query parameter (commented-out variant)
    #url='http://www.baidu.com/s?wd='
    #
    #wd='图片'
    #wd=urllib.parse.quote(wd)  #percent-encode the non-ascii keyword
    #end_url=url+wd
    #
    #headers={
    # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'X-REQUESTED-With':'XMLHttpRequest',
    # 'Content-Type':'application/x-www-form-urlencoded'
    # }
    #
    #user_agent_list=[
    # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    # "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    # ]
    #
    #end_user_agent=random.choice(user_agent_list)
    #
    #req=urllib.request.Request(end_url,headers=headers)
    #req.add_header('User-Agent',end_user_agent)
    #
    #response=urllib.request.urlopen(req)
    #data=response.read().decode('utf-8')
    #print(data)

    #multiple query parameters
    url='https://www.baidu.com/s?'
    da_ta={
    'wd':'风景',  #the search keyword (non-ascii on purpose)
    'key':'zhang',
    'value':'san'
    }
    #urlencode joins and percent-encodes the key=value pairs
    final_da_ta=urllib.parse.urlencode(da_ta)

    final_url=url+final_da_ta

    end_url=urllib.parse.quote(final_url,safe=string.printable)  #leave printable ascii characters unescaped
    print(end_url)
    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }

    user_agent_list=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_agent=random.choice(user_agent_list)
    headers['User-Agent']=end_user_agent
    req=urllib.request.Request(end_url,headers=headers)
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)

    6.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 16:50:51 2019

    @author: Office
    """
    '''
    POST request
    How it works: the parameters are packed up and sent separately

    Pros: large payload, more private (recommended when modifying data on the server)

    Cons: slower
    '''

    import urllib.parse
    import urllib.request
    url='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    headers={
    'Accept':'application/json, text/javascript, */*; q=0.01',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer':'http://fanyi.youdao.com/?keyfrom=dict2.index',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'
    }
    #assemble the data to send into a dict
    #the dict keys come from the page itself, usually the name attribute of the input tags

    key=input("Enter the text to translate:")
    data={
    'i' : key,
    'from' : 'AUTO',
    'to' : 'AUTO',
    'smartresult' : 'dict',
    'client' : 'fanyideskweb',
    'salt': '15564473252080',  #salt/sign are generated per request by the site; stale values may be rejected
    'sign': 'b6f44d14938df7391a28b66252a461aa',
    'doctype' : 'json',
    'version' : '2.1',
    'keyfrom' : 'fanyi.web',
    'action' : 'FY_BY_CLICKBUTTION'
    }
    #urlencode the payload and encode it to bytes
    da_ta=urllib.parse.urlencode(data).encode('utf-8')
    #send the request; passing data makes it a POST
    end_data=urllib.request.urlopen(url,da_ta).read().decode('utf-8')
    print(end_data)
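
    #end_data is a JSON string; a hedged sketch of pulling the translation out of it
    #(the field layout below is assumed from typical youdao responses, not verified):
    #import json
    #result=json.loads(end_data)
    #print(result['translateResult'][0][0]['tgt'])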

    7.

    # -*- coding: utf-8 -*-
    """
    Created on Mon Apr 29 11:02:48 2019

    @author: Office
    """

    from bs4 import BeautifulSoup
    import urllib.request

    #parse a local file
    soup = BeautifulSoup(open("soup_text.html",encoding="utf-8"),'lxml')

    #look up by tag name
    #print(soup.a)  #only finds the first matching tag
    #print(soup.div)

    #get attributes
    #print(soup.a["href"])        #get the href attribute
    #print(soup.a.attrs)          #get all attributes and values as a dict
    #print(soup.a.attrs["href"])  #equivalent spelling

    #get content
    #print(soup.a.text)
    #print(soup.a.string)
    #print(soup.a.get_text())

    #the difference: if the tag contains another tag, string returns None, while the other two still return the text
    #print(soup.div.text)
    #print(soup.div.string)
    #print(soup.div.get_text())
    #print(soup.div.get_text().split()[0])  #grab the first token inside
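
    #A runnable illustration of the difference on a tiny inline document
    #(hypothetical markup, independent of soup_text.html):
    #demo = BeautifulSoup('<div><a href="#">link</a> tail</div>','lxml')
    #print(demo.div.string)  #None: the div holds a tag plus text
    #print(demo.div.text)    #'link tail'
    #print(demo.a.string)    #'link': the a tag has a single string child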

    #find returns the first matching tag
    #print(soup.find('a'))              #the first matching a
    #print(soup.find('a',title="qin"))  #narrow the search with a second condition, title="qin"
    #print(soup.find('a',class_="du"))  #class is a keyword, so a trailing underscore is needed
    #print(soup.find('a',id="feng"))

    #find works not only on soup but on ordinary tag objects too; it searches for matching nodes inside that object
    #drill down level by level
    #div=soup.find('div',class_='tang')
    #print(div.find('a',alt="qi"))
    #print(div.find('a',class_="du"))  #if two tags match, it still returns the first

    #find_all
    #lt=soup.find_all('a')  #find all a tags
    #print(lt,len(lt))

    #div=soup.find('div',class_='tang')
    #print(div.find_all('a'))
    #print(div.find_all(['i','b']))    #find_all also accepts several tag names, as a list
    #print(div.find_all('a',limit=2))  #of all matches, keep only the first 2

    #select: pick content with CSS selectors
    #common selectors: tag, class, id, combinators, descendant, attribute
    #select always returns a list; take an element by index, then read its attributes or text
    #print(soup.select('div > ul > li > a'))      #there must be spaces around the > combinator
    #print(soup.select('div > ul > li > a')[0])   #take the first
    #print(soup.select('.tang > ul > li > a')[0]) #equivalent spelling, same result as above
    #print(soup.select('#du'))                    #id selector
    #print(soup.select('#feng')[0].text)          #select returns a list: index in first, then call a text accessor
    #print(soup.select('#feng')[0]['href'])       #the value of href

    #select can also be called on an ordinary tag object; it finds all matching nodes under that object
    #div=soup.find('div',class_='tang')
    #print(div.select('.du'))
    #print(soup.select('.du'))
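
    #Self-contained select sketch on inline markup (hypothetical, independent of soup_text.html):
    #demo = BeautifulSoup('<div class="tang"><ul><li><a class="du" href="#x">hi</a></li></ul></div>','lxml')
    #print(demo.select('div > ul > li > a'))  #[<a class="du" href="#x">hi</a>]
    #print(demo.select('.du')[0]['href'])     #'#x'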

    8.

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 11:05:33 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context

    url='http://www.renren.com/970622703/profile'
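    #reach a login-protected page by replaying the Cookie header captured from a logged-in browser session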
    headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36',
    'Cookie':'anonymid=jv4jjsmt8luy21; ln_uact=17767258153; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; __guid=238633222.311801873786504100.1556674290342.3481; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291856; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291858; wp_fold=0; depovince=GW; _r01_=1; JSESSIONID=abcnRiMszrXoLbNlVdXPw; ick_login=4c390ed0-4fe6-4264-b9b2-610a614ac13c; first_login_flag=1; jebecookies=989247e8-b114-48f9-9592-aec3cd10e92b|||||; _de=7266BDD6184F288A5EF7AB01E3CFE338; p=38e98cbf34016e9010c9f1f73791f2423; t=3b04ed4095e7a4b7612203f7169bbc843; societyguester=3b04ed4095e7a4b7612203f7169bbc843; id=970622703; xnsid=8ebbfe1f; ver=7.0; loginfrom=null; monitor_count=9',
    }

    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

    9.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 08:52:30 2018

    @author: T0001
    """

    html='''<tr>
    <td class="posterColumn">
    <span name="rk" data-value="1"></span>
    <span name="ir" data-value="9.216510839765467"></span>
    <span name="us" data-value="7.791552E11"></span>
    <span name="nv" data-value="1868842"></span>
    <span name="ur" data-value="-1.7834891602345326"></span>
    <div class="unseeable">NOT YET RELEASED</div>
    </td>
    <td class="titleColumn">
    1.
    <a href="/title/tt0111161" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman" >The Shawshank Redemption</a>
    <span class="secondaryInfo">(1994)</span>
    </td>
    <td class="ratingColumn imdbRating">
    <strong title="9.2 based on 1,868,842 user ratings">9.2</strong>
    </td>
    <td class="ratingColumn">
    <div class="seen-widget seen-widget-tt0111161 pending" data-titleid="tt0111161">
    <div class="inline">
    <div class="pending">3.2</div>
    <div class="unseeable">NOT YET RELEASED</div>
    <div class="unseen">4.5</div>
    <div class="rating"></div>
    <div class="seen">Seen</div>
    </div>
    </div>
    </td>
    <td class="watchlistColumn">
    <div class="wlb_ribbon" data-tconst="tt0111161" data-recordmetrics="true"></div>
    </td>
    </tr>
    '''

    from lxml import etree

    #parse a local file
    #tree=etree.parse("filename")

    #parse a page fetched from the web
    #tree=etree.HTML("page string")

    imdb=etree.HTML(html)

    #locate by attribute
    #print(imdb.xpath('//span[@name="ir"]'))
    #print(imdb.xpath('//div[@data-tconst]'))

    #mix hierarchy and index
    #print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[1]'))  #indexing starts at 1
    #print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[@class="unseeable"]'))  #the last step can also locate by attribute

    #print(imdb.xpath('//td[@class="ratingColumn"]//div'))                 #all divs under <td class="ratingColumn">
    #print(imdb.xpath('//td[@class="ratingColumn"]//div[@class="seen"]'))  #the tail step can also filter by attribute

    #result1=imdb.xpath('//div[@class="inline"]/div[last()-2]')


    #logical operators
    #print(imdb.xpath('//div[@class="wlb_ribbon" and @data-tconst="tt0111161"]'))  #if one attribute is not selective enough, add another, joined with "and"

    #fuzzy matching
    #print(imdb.xpath('//div[contains(@class,"un")]'))    #all divs whose class attribute contains "un"
    #print(imdb.xpath('//div[contains(text(),4)]'))       #all divs whose text contains 4
    #print(imdb.xpath('//div[starts-with(@class,"r")]'))  #all divs whose class attribute starts with "r"

    #get text content
    #print(imdb.xpath('//div[@class="inline"]/div[5]/text()'))  #the node's text

    #get an attribute
    #print(imdb.xpath('//div[@class="inline"]/div[2]/@class'))

    #print(imdb.xpath('//div[@class="inline"]//text()'))  #all tag-free text under <div class="inline">

    #print(imdb.xpath('//div[@class="inline"]/div[last()-1]/@class'))  #another way to reach the same attribute

    #extract the leading "1."
    #s=imdb.xpath('//td[@class="titleColumn"]/text()')
    #a=[]
    #for i in s:
    #    if i.strip() != "":
    #        a.append(i.strip())

    #s=imdb.xpath('//td[@class="titleColumn"]')
    #k=s[0].xpath('string(.)')  #string(.) flattens all text under the node
    #l=k.replace(' ', '').replace(' ', '')
    #print(l.strip().split()[0])

    #for i in result1:
    #    print(etree.tostring(i))
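
    #Putting the pieces together on the inline snippet above (a sketch; the two
    #xpath expressions below are assumptions about the markup held in html):
    #title=imdb.xpath('//td[@class="titleColumn"]/a/text()')[0]                 #'The Shawshank Redemption'
    #rating=imdb.xpath('//td[contains(@class,"imdbRating")]/strong/text()')[0]  #'9.2'
    #print(title,rating)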

    10.

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 11:13:30 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse


    url='http://www.baidu.com/'
    proxy={
    'http':'222.135.92.68:38094'
    }

    #create a handler that routes requests through the proxy
    handler=urllib.request.ProxyHandler(proxy)
    #create an opener from the handler
    opener=urllib.request.build_opener(handler)

    headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))

    11.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 11:37:22 2018

    @author: T0001
    """

    from lxml import etree
    import numpy as np
    import pandas as pd
    import urllib.request
    import random
    url='http://news.ceic.ac.cn/'
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }
    headers['User-Agent']=agentStr
    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req).read().decode('utf-8')

    earth=etree.HTML(response)
    result=earth.xpath('//td[@align="center"]/text()')
    result1=earth.xpath('//td[@align="left"]/a/text()')

    data=np.array(result).reshape((-1,5))  #five centered cells per earthquake row

    c=np.column_stack((data,result1))

    #column names guessed from the CEIC listing layout (magnitude, time, latitude, longitude, depth, location)
    pd.DataFrame(c,columns=['magnitude','time','latitude','longitude','depth','location']).to_csv('dz.csv',index=False)

    12.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 19:06:01 2019

    @author: Office
    """
    import urllib.request
    import ssl
    import random
    import json
    import pandas as pd
    from sqlalchemy import create_engine
    #pool of fake User-Agent strings
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    #pick a User-Agent at random
    agentStr=random.choice(agentlist)

    def ajaxCrawler(url):
        headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
        }
        req=urllib.request.Request(url,headers=headers)
        # use ssl to create an unverified context
        context=ssl._create_unverified_context()
        response=urllib.request.urlopen(req,context=context)

        jsonStr=response.read().decode('utf-8')
        jsonData=json.loads(jsonStr)
        return jsonData

    title=[]
    score=[]
    release_date=[]
    vote_count=[]
    for i in range(1,100):
        url='https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start='+str(i*20)+'&limit=20'
        info=ajaxCrawler(url)
        for j in info:
            title.append(j["title"])
            score.append(j['score'])
            release_date.append(j['release_date'])
            vote_count.append(j['vote_count'])

    #assemble a DataFrame
    data=pd.DataFrame({'score':score,'title':title,'release_date':release_date,'vote_count':vote_count},columns=['score','title','release_date','vote_count'])
    #save to csv
    #data.to_csv('dy.csv')

    #save to mysql
    engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
    data.to_sql('douban',engine,if_exists="replace")

    13.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jun 13 20:12:39 2019

    @author: wqq
    """

    import urllib.request
    import re
    import random
    import gzip
    import numpy as np
    import pandas as pd
    url="http://esf.hz.fang.com/"
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }
    headers['User-Agent']=agentStr
    req=urllib.request.Request(url,headers=headers)

    response=urllib.request.urlopen(req)
    #print(response.info().get('Content-Encoding'))  #the site answers with a gzip-compressed body
    string=gzip.decompress(response.read()).decode('gbk')
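
    #The server does not always gzip the body; a hedged variant that checks the
    #Content-Encoding header first and falls back to plain decoding:
    #raw=response.read()
    #if response.info().get('Content-Encoding')=='gzip':
    #    string=gzip.decompress(raw).decode('gbk')
    #else:
    #    string=raw.decode('gbk')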

    phone_reg=r'''<span class="red"><b>(.*?)</b>'''
    phone_pat=re.compile(phone_reg)
    z_jia=re.findall(phone_pat,string)

    phone_reg=r'''<span>(.*?)元/㎡</span>'''
    phone_pat=re.compile(phone_reg)
    d_jia=re.findall(phone_pat,string)


    phone_reg=r'''<p class="tel_shop">(.*?)<span class="people_name">'''
    phone_pat=re.compile(phone_reg,re.S)
    match=re.findall(phone_pat,string)

    g_ju=[]     #layout
    m_ji=[]     #area
    l_ceng=[]   #floor
    c_xiang=[]  #facing
    n_dai=[]    #build year
    for i in match:
        k=i.split()
        g_ju.append(k[0])
        m_ji.append(k[1].split("<i>|</i>")[1])
        if "<i>|</i>" not in k[2]:
            l_ceng.append(k[2])
        else:
            l_ceng.append(k[2].split("<i>|</i>")[1])

        if "<i>|</i>" not in k[3]:
            c_xiang.append(k[3])
        else:
            c_xiang.append(k[3].split("<i>|</i>")[1])

        if "<i>|</i>" not in k[4]:
            n_dai.append(k[4])
        else:
            n_dai.append(k[4].split("<i>|</i>")[1])

    phone_reg=r'''<a target="_blank" href="/house-xm\d+/" title=(.*?)>'''  #estate name; note the \d+ in the path
    phone_pat=re.compile(phone_reg)
    g_yu_name=re.findall(phone_pat,string)


    phone_reg=r'''<span class="tit_shop">(.*?)</span>'''  #listing title
    phone_pat=re.compile(phone_reg)
    title=re.findall(phone_pat,string)

    phone_reg=r'''<span>(.*?)</span>'''  #district: every other span, dropping the trailing one
    phone_pat=re.compile(phone_reg)
    d_duan=re.findall(phone_pat,string)[::2]
    d_duan.remove(d_duan[-1])

    pd.DataFrame({'title':title,'g_ju':g_ju,
    'm_ji':m_ji,'l_ceng':l_ceng,
    'c_xiang':c_xiang,'n_dai':n_dai,
    'z_jia(10k yuan)':z_jia,'d_jia(yuan/m2)':d_jia,
    'g_yu_name':g_yu_name,'d_duan':d_duan},
    columns=['title','g_ju','m_ji','l_ceng','c_xiang','n_dai','z_jia(10k yuan)','d_jia(yuan/m2)','g_yu_name','d_duan']).to_csv("second_hand_houses.csv",index=False)

    14.

    # -*- coding: utf-8 -*-
    """
    Created on Mon Apr 29 08:32:04 2019

    @author: Office
    """
    import urllib.request
    import random
    import re

    def handle_request(url,page=None):
        if page != None:
            url=url+str(page)+".html"
        agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]
        #pick a User-Agent at random
        agentStr=random.choice(agentlist)
        headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
        }
        request=urllib.request.Request(url,headers=headers)
        return request

    def get_text(a_href):
        #build the request object via the helper
        request = handle_request(a_href)
        #send the request and get the response
        content = urllib.request.urlopen(request).read().decode('utf-8')
        #parse out the body
        pattern = re.compile(r'<div class="neirong">(.*?)</div>',re.S)
        lt = pattern.findall(content)
        text = lt[0]

        #a regex that strips every image tag from the content
        pat=re.compile(r'<img .*?>')
        text=pat.sub('',text)
        return text

    def parse_content(content):
        #the regex
        pattern=re.compile(r'<h3><a href="(/lizhi/qianming/\d+\.html)">(.*?)</a></h3>')
        #findall returns a list of tuples: the first element of each tuple is what
        #the first group matched, the second element is what the second group matched
        lt=pattern.findall(content)
        #walk the list
        for href_title in lt:
            #the link to the content
            a_href = 'http://www.yikexun.cn' + href_title[0]
            #the title
            title = href_title[-1]
            #request a_href and get the body text
            text = get_text(a_href)
            #append to an html file
            string = '<h1>%s</h1>%s' % (title,text)
            with open('lizhi.html', 'a', encoding='utf8') as f:
                f.write(string)

    def main():
        url='http://www.yikexun.cn/lizhi/qianming/list_50_'
        start_page=int(input('Enter the start page:'))
        end_page=int(input('Enter the end page:'))
        for page in range(start_page,end_page+1):  #+1 so the end page is included
            #build the request from the url and page
            request=handle_request(url,page)
            content=urllib.request.urlopen(request).read().decode('utf-8')

            #parse the content
            parse_content(content)

    main()

    15.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 14:16:22 2018

    @author: T0001
    """
    #crawl images

    import urllib.request
    from lxml import etree
    import random

    url="https://www.ivsky.com/tupian/ziranfengguang/"
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent':agentStr,
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }


    proxy=[
    {'http':'http://61.164.39.66:53281'} ,
    {'http':'http://116.209.57.18:9999'},
    {'http':'http://183.148.133.77:9999'},
    {'http':'http://211.23.149.29:80'},
    {'http':'http://39.137.69.10:8080'}
    ]

    end_proxy=random.choice(proxy)

    proxy_handler=urllib.request.ProxyHandler(end_proxy)

    opener=urllib.request.build_opener(proxy_handler)

    req=urllib.request.Request(url,headers=headers)

    response=opener.open(req)
    html=response.read().decode("utf-8")

    html=etree.HTML(html)
    a=html.xpath('//div[@class="il_img"]/a/@href')

    for i in a:
        url_new="https://www.ivsky.com"+i
        req1=urllib.request.Request(url_new,headers=headers)
        response1=opener.open(req1)
        html1=response1.read().decode("utf-8")
        html_pic=etree.HTML(html1)
        pic=html_pic.xpath('//div[@class="il_img"]/a/img/@src')

        for j in pic:
            end_url="https:"+j
            req2=urllib.request.Request(end_url,headers=headers)
            response2=opener.open(req2)
            html2=response2.read()
            with open('pic/'+j.split('/')[-1],'wb') as f:  #assumes the pic/ directory exists
                f.write(html2)

    16.

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 17:33:25 2019

    @author: admin
    """
    import urllib.request
    url='http://www.baidu.com/'

    #create a plain HTTP handler
    handler=urllib.request.HTTPHandler()
    #create an opener from it
    opener=urllib.request.build_opener(handler)

    headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))

    17.

    # -*- coding: utf-8 -*-
    """
    Created on Fri Apr 26 08:37:26 2019

    @author: Office
    """

    import urllib.request
    import random
    import re

    url="https://www.qiushibaike.com/text/page/1/"
    proxy=[
    {'http':'http://61.164.39.66:53281'} ,
    {'http':'http://116.209.57.18:9999'},
    {'http':'http://183.148.133.77:9999'},
    {'http':'http://211.23.149.29:80'},
    {'http':'http://39.137.69.10:8080'}
    ]

    end_proxy=random.choice(proxy)

    proxy_handler=urllib.request.ProxyHandler(end_proxy)

    opener=urllib.request.build_opener(proxy_handler)

    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent':agentStr,
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }
    req=urllib.request.Request(url,headers=headers)

    response=opener.open(req)
    html=response.read().decode('utf-8')
    print(html)
    pat=r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
    re_joke=re.compile(pat,re.S)
    divsList=re_joke.findall(html)
    dic={}
    for i in divsList:
        #username
        re_u=re.compile(r'<h2>(.*?)</h2>',re.S)
        username=re_u.findall(i)
        username=username[0]

        #the joke text
        re_d=re.compile(r'<div class="content"> <span>(.*?)</span>',re.S)
        duanzi=re_d.findall(i)
        duanzi=duanzi[0]
        dic[username]=duanzi
    print(dic)

    18.

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 08:50:22 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse
    import http.cookiejar
    import ssl

    ssl._create_default_https_context = ssl._create_unverified_context
    #properly mimic the browser: after the login POST is sent, keep the returned cookie for later requests
    #create a cookiejar object
    cj=http.cookiejar.CookieJar()
    #create a handler from the cookiejar
    handler=urllib.request.HTTPCookieProcessor(cj)
    #build an opener from the handler
    opener=urllib.request.build_opener(handler)

    url='http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201943946542 '

    headers={
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }

    formdata={
    'email':'17767258153',
    'icode':'' ,
    'origURL':'http://www.renren.com/home',
    'domain':'renren.com',
    'key_id':'1',
    'captcha_type':'web_login',
    'password':'204b8409cfb80c1d46a7134d150cd281a1808d1c0429eb7334a3fa8f4c6ae327',
    'rkey':'b8871697112ad27ac3a61f5e85ebf5b4',
    'f':'http%3A%2F%2Fwww.renren.com%2F970622703',
    }

    formdata=urllib.parse.urlencode(formdata).encode('utf-8')
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req,data=formdata)
    #print(response.read().decode('utf-8'))

    get_url="http://www.renren.com/970622703/profile"
    req=urllib.request.Request(get_url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))

    19.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 11:28:33 2019

    @author: Office
    """

    import urllib.request
    from lxml import etree
    import random
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine

    url='http://tubiao.17mcp.com/Ssq/index-500.html'
    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent':agentStr,
    'X-REQUESTED-With':'XMLHttpRequest',
    'Content-Type':'application/x-www-form-urlencoded'
    }

    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req).read().decode('utf-8')
    html=etree.HTML(response)
    data=html.xpath('//tr/td[@style="color:White"]/text()')
    qihao=html.xpath('//tr[@style="height: 25px"]/td[1]/text()')

    da_ta=np.array(data).reshape(-1,7)  #seven numbers per draw
    qi_hao=np.array(qihao)

    end_data=np.column_stack((qi_hao,da_ta))

    final_data=pd.DataFrame(end_data,columns=['qihao','one','two','three','four','five','six','seven'])

    #save to csv
    final_data.to_csv('shuangseqiu.csv',index=False)

    #save to mysql
    engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
    final_data.to_sql('shuangseqiu',engine,if_exists="replace")

    20.

    # -*- coding: utf-8 -*-
    """
    Created on Fri Apr 26 15:36:41 2019

    @author: Office
    """

    import urllib.request
    import random
    import re

    keyname="chakra bracelet"
    key=urllib.request.quote(keyname)
    for i in range(1,2):
        try:
            print("-------- crawling page "+str(i)+" ------------")
            url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
            proxy=[
            {'http':'http://61.164.39.66:53281'} ,
            {'http':'http://116.209.57.18:9999'},
            {'http':'http://183.148.133.77:9999'},
            {'http':'http://211.23.149.29:80'},
            {'http':'http://39.137.69.10:8080'}
            ]

            end_proxy=random.choice(proxy)

            proxy_handler=urllib.request.ProxyHandler(end_proxy)

            opener=urllib.request.build_opener(proxy_handler)

            agentlist=[
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
            ]
            agentStr=random.choice(agentlist)

            headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent':agentStr,
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
            }
            req=urllib.request.Request(url,headers=headers)

            response=opener.open(req)
            data=response.read().decode("utf-8","ignore")

            pat='"pic_url":"//(.*?)"'
            imglist=re.compile(pat).findall(data)
            for j in range(0,len(imglist)):
                try:
                    thisimg=imglist[j]
                    thisimgurl="http://"+thisimg
                    localfile="D:/"+str(i)+"_"+str(j)+".jpg"
                    urllib.request.urlretrieve(thisimgurl,filename=localfile)
                except Exception as err:
                    pass
        except Exception as err:
            pass

    21.

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jun 13 18:12:39 2019

    @author: wqq
    """

    import urllib.request
    import urllib.parse
    import ssl
    import random
    from lxml import etree
    import pandas as pd

    ssl._create_default_https_context = ssl._create_unverified_context
    url = 'https://veromoda.tmall.com/p/rd609297.htm?spm=a1z10.10672-b-s.w5001-17277175636.16.7b822b67cHKn8X&scene=taobao_shop'

    agentlist=[
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0" ,
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"]
    agentstr = random.choice(agentlist)

    headers = {
    'user-agent':agentstr,
    'Accept': 'image/webp,*/*',
    'Cookie': 'cq=ccp%3D1; cna=OA95FVY8Iw4CAXAKF/liwJ5M; isg=BI2N3dH67G6QcEhAxVcwy0Dzn6nHwsFXFVAU088SySSTxq14l7rRDNtcMJoFHdn0; l=bBNzmI9HqQPbVy7kBOCwquI8aG7OSIOYYuPRwNqXi_5ay1T_qsQOkjo1oe96Vs5RsXTB4mxQgLp9-etks; hng=""; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; t=2e15a61bdd752ef76d25e931fbd573ee; lid=%E4%BD%8E%E8%B0%83leoalan; _tb_token_=e1b6ee565fbb5; cookie2=1f4e270456996b258181536824f34637'

    }

    req = urllib.request.Request(url,headers=headers)
    response = urllib.request.urlopen(req)
    data = response.read().decode('gbk')

    imdb = etree.HTML(data)

    title = imdb.xpath('//span[@class="user_name"]/text()')
    adress = imdb.xpath('//div[@class="user_w990"]//a[@target="_blank"]/@href')
    price = imdb.xpath('//span/strong[@class="user_pricetit"]/text()')
    #oldprice = imdb.xpath('//span/span[@class="user_ft14 user_yj"]/text()')

    #prefix the scheme onto each protocol-relative link
    a=0
    for i in adress:
        adress[a] = 'https:'+i
        a+=1

    pd.DataFrame({
    'product name':title,
    'product link':adress,
    'product price':price
    },
    columns=['product name','product link','product price']
    ).to_excel('D:/tmall_products.xls')

    22.

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 19:37:12 2019

    @author: Office
    """

    import urllib.request
    import urllib.parse
    from lxml import etree
    import time
    import random
    import os

    def handle_request(url,page):
        #the first page's url pattern differs from the later pages, so branch on the page number
        if page == 1:
            url = url.format('')
        else:
            url = url.format('_' + str(page))
        headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
        }

        user_agent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]

        end_user_agent = random.choice(user_agent_list)
        headers['User-Agent'] = end_user_agent
        request = urllib.request.Request(url,headers=headers)
        return request
    #parse the content
    def parse_content(content):
        tree = etree.HTML(content)
        #the images are lazy-loaded, so the real address sits in the src2 attribute
        image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
        #walk the list and download each image
        for image_src in image_list:
            download_image(image_src)

    def download_image(image_src):
        dirpath = 'xinggan'
        #create the folder if it does not exist yet
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        #derive a file name
        filename = os.path.basename(image_src)
        #full path for the image
        filepath = os.path.join(dirpath,filename)
        #send the request and save the image
        headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
        }

        user_agent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]

        end_user_agent = random.choice(user_agent_list)
        headers['User-Agent'] = end_user_agent
        request = urllib.request.Request(image_src,headers=headers)
        response = urllib.request.urlopen(request)
        with open(filepath,'wb') as f:
            f.write(response.read())

    def main():
        url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'  #{} becomes '' for page 1 and '_N' for later pages
        start_page = int(input('Enter the start page:'))
        end_page = int(input('Enter the end page:'))
        for page in range(start_page,end_page+1):
            request = handle_request(url,page)
            content = urllib.request.urlopen(request).read().decode('utf-8')
            parse_content(content)
            time.sleep(2)  #be polite between pages

    if __name__ == '__main__':
        main()

    23.

    # -*- coding: utf-8 -*-
    """
    Created on Sat Jun 30 21:07:14 2018

    @author: Chen
    """

    import pydotplus
    import os
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    import pandas as pd

    #read the csv into a dataframe
    df = pd.read_csv('./data.csv')
    #print(df.head())  #useful while testing

    df = df[['weather','temperature','humidity','wind','sports']]
    df['weather'] = df['weather'].map({'晴': 0, '阴': 1, '雨': 2})                 #sunny/overcast/rain
    df['temperature'] = df['temperature'].map({'炎热': 0, '适中': 1, '寒冷': 2})   #hot/mild/cold
    df['wind'] = df['wind'].map({'弱': 0, '强': 1})                               #weak/strong

    #split into a feature table and a label column
    df = df.dropna()
    X = df.drop('sports', axis=1)
    Y = df['sports']

    '''
    #split into train and test sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)
    '''

    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    #fit on the training set
    #model.fit(X_train, y_train)
    #fit on the full data
    model.fit(X, Y)

    '''
    #check the model's accuracy on the test set
    y_predict = model.predict(X_test)
    from sklearn.metrics import accuracy_score
    accuracy_score(y_test, y_predict)
    '''

    #render the tree as a graphviz visualization
    dot_data = tree.export_graphviz(model.tree_, out_file=None,
                                    feature_names=X.columns,
                                    class_names=['no','yes'],
                                    filled=True, rounded=True, # leaves_parallel=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)

    nodes = graph.get_node_list()

    for node in nodes:
        if node.get_label():
            values = [int(ii) for ii in node.get_label().split('value = [')[1].split(']')[0].split(',')]
            color = {0: [255,255,224], 1: [255,224,255], 2: [224,255,255]}
            values = color[values.index(max(values))]  #palette entry for the majority class
            color = '#{:02x}{:02x}{:02x}'.format(values[0], values[1], values[2])
            node.set_fillcolor(color)

    graph.write_pdf("tree.pdf")
    graph.write_png("tree.png")

