  • Python -- Crawler Basics

    1、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 10:30:26 2019

    @author: Office
    """
    import urllib.request

    #The site to crawl
    url = "http://www.baidu.com/"

    #response: send a request to the given url and return the http response data (a file-like object)
    response = urllib.request.urlopen(url)

    #Read the content, as bytes
    data = response.read()#reads the whole content and returns it as one bytes string
    #data = response.readline()#reads a single line; printing everything requires a loop
    #data = response.readlines()#reads the whole content and returns it as a list of lines
    #print(data)
    #print(type(data))

    #Decode the fetched bytes into a string
    str_data = data.decode("utf-8")
    #print(str_data)
    #print(type(str_data))

    #Write the crawled page to a file
    #Method 1
    with open("baidu.html","w",encoding="utf-8") as f:#write it out as str
        f.write(str_data)

    #Method 2: urlretrieve leaves some cache files behind while it runs, which need cleaning up
    #urllib.request.urlretrieve(url,"baidu2.html")
    #urllib.request.urlcleanup() #clear the cache (takes no arguments)

    #response attributes
    #print(response.info())#returns information about the current environment
    #print(response.getcode())#returns the status code; just remember 200, 304 (client did a GET but the file is unchanged, i.e. served from cache), 400 (bad request, e.g. malformed syntax), 500 (internal server error)
    #print(response.geturl())#returns the URL currently being crawled
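
    The readline()/readlines() variants commented out above need a loop to print everything; a minimal sketch (the response is a file-like object, so it can be iterated directly):

    import urllib.request

    with urllib.request.urlopen("http://www.baidu.com/") as response:
        for line in response:               # iterating a file-like response yields bytes lines
            print(line.decode("utf-8"), end="")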

     2、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 15:09:34 2019

    @author: Office
    """
    import urllib.request
    url = "http://www.baidu.com/"
    #Spoof the request headers
    headers={
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }

    #Build a Request object
    req=urllib.request.Request(url,headers=headers)

    #Send the request
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)

     3、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 15:17:49 2019

    @author: Office
    """

    import urllib.request
    import random
    url = "http://www.baidu.com/"
    #Pool of User-Agent strings to spoof the request headers
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    #Pick a User-Agent at random
    agentStr=random.choice(agentlist)
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }

    #Build a Request object
    req=urllib.request.Request(url,headers=headers)

    #Send the request
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    #print(data)
    print(req.get_full_url()) #get the url being requested
    print(req.get_header('User-agent')) #get the User-Agent; capitalize only the first word, the rest stay lowercase

    #Second approach:
    url = "http://www.baidu.com/"
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }

    user_angent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_angent=random.choice(user_angent_list)

    req=urllib.request.Request(url,headers=headers)
    req.add_header('User-Agent',end_user_angent)

    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)

     4、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 16:10:42 2019

    @author: Office
    """
    import urllib.request
    url = "http://www.baidu.com/"
    #If the page does not respond for too long, treat it as a timeout and skip it
    for i in range(1,100):
        try:
            response=urllib.request.urlopen(url,timeout=0.2)
            print(len(response.read().decode('utf-8')))
        except Exception:
            print("Request timed out, moving on to the next crawl")

     5、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 16:24:45 2019

    @author: Office
    """
    #http usage: message passing between a client and a server
    #GET: pass information through the url; the data can be appended directly to the url
    #POST: submit data to the server; a popular and relatively secure way to transfer data
    #PUT: ask the server to store a resource, usually at a specified location
    #DELETE: ask the server to delete a resource


    '''
    GET request
    Characteristics: the data is appended to the request path and passed to the server

    Pros: fast

    Cons: small payload, not secure

    '''
    import urllib.request
    import urllib.parse
    import string
    import random

    #Single query parameter
    #url='http://www.baidu.com/s?wd='
    #
    #wd='图片'
    #wd=urllib.parse.quote(wd)
    #end_url=url+wd
    #
    #headers={
    #    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    #    'X-REQUESTED-With':'XMLHttpRequest',
    #    'Content-Type':'application/x-www-form-urlencoded'
    #}
    #
    #user_angent_list=[
    #    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    #    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    #    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    #    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    #    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    #]
    #
    #end_user_angent=random.choice(user_angent_list)
    #
    #req=urllib.request.Request(end_url,headers=headers)
    #req.add_header('User-Agent',end_user_angent)
    #
    #response=urllib.request.urlopen(req)
    #data=response.read().decode('utf-8')
    #print(data)

    #Multiple query parameters
    url='https://www.baidu.com/s?'
    da_ta={
        'wd':'风景',
        'key':'zhang',
        'value':'san'
    }
    final_da_ta=urllib.parse.urlencode(da_ta)

    final_url=url+final_da_ta

    end_url=urllib.parse.quote(final_url,safe=string.printable)
    print(end_url)
    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }

    user_angent_list=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]

    end_user_angent=random.choice(user_angent_list)
    headers['User-Agent']=end_user_angent
    req=urllib.request.Request(end_url,headers=headers)
    response=urllib.request.urlopen(req)
    data=response.read().decode('utf-8')
    print(data)
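
    The comment block at the top of this file also lists PUT and DELETE; urllib.request.Request accepts a method argument for those verbs. A minimal sketch, using httpbin.org purely as a hypothetical echo endpoint:

    import urllib.request

    req = urllib.request.Request(
        "http://httpbin.org/put",   # hypothetical test endpoint
        data=b"payload",            # the resource body to store
        method="PUT"                # "DELETE" works the same way
    )
    with urllib.request.urlopen(req) as response:
        print(response.getcode())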

     6、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 16:50:51 2019

    @author: Office
    """
    '''
    POST request
    Characteristics: the parameters are packed up and transferred separately

    Pros: large payload, more secure (recommended when modifying data on the server)

    Cons: slower
    '''

    import urllib.parse
    import urllib.request
    url='http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    headers={
    'Accept':'application/json, text/javascript, */*; q=0.01',
    'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
    'Referer':'http://fanyi.youdao.com/?keyfrom=dict2.index',
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'
    }
    #Assemble the data to send into a dict
    #The keys come from the page source, usually the name attribute of the input tags

    key=input("Enter the text to translate: ")
    data={
        'i' : key,
        'from' : 'AUTO',
        'to' : 'AUTO',
        'smartresult' : 'dict',
        'client' : 'fanyideskweb',
        'salt': '15564473252080',
        'sign': 'b6f44d14938df7391a28b66252a461aa',
        'doctype' : 'json',
        'version' : '2.1',
        'keyfrom' : 'fanyi.web',
        'action' : 'FY_BY_CLICKBUTTION'
    }
    #Pack the data to send, and remember to encode it
    da_ta=urllib.parse.urlencode(data).encode('utf-8')
    #Send the request
    end_data=urllib.request.urlopen(url,da_ta).read().decode('utf-8')
    print(end_data)
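
    Since doctype=json was requested, the response body is a JSON string; a short sketch of decoding it (the translateResult layout is an assumption about this service's reply and should be verified against the actual output):

    import json

    result = json.loads(end_data)
    #Assumed shape: {"translateResult": [[{"src": ..., "tgt": ...}]], ...} -- verify first
    print(result.get("translateResult", result))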

     7、

    # -*- coding: utf-8 -*-
    """
    Created on Mon Apr 29 11:02:48 2019

    @author: Office
    """

    from bs4 import BeautifulSoup
    import urllib.request

    #Parse a local file
    soup = BeautifulSoup(open("soup_text.html",encoding="utf-8"),'lxml')

    #Look up by tag name
    #print(soup.a) only finds the first matching tag
    #print(soup.div)

    #Get attributes
    #print(soup.a["href"])#get the href attribute
    #print(soup.a.attrs)#get all attributes and values, returned as a dict
    #print(soup.a.attrs["href"])#this form works too

    #Get content
    #print(soup.a.text)
    #print(soup.a.string)
    #print(soup.a.get_text())

    #The difference between the three: if the tag contains nested tags, string returns None, while the other two still return the text content
    #print(soup.div.text)
    #print(soup.div.string)
    #print(soup.div.get_text())
    #print(soup.div.get_text().split()[0])#get the first element inside

    #find always returns the first matching tag
    #print(soup.find('a'))#find the first matching a
    #print(soup.find('a',title="qin"))#narrow the search with a second condition, title="qin"
    #print(soup.find('a',class_="du"))#class is a keyword, so it needs a trailing underscore
    #print(soup.find('a',id="feng"))

    #find works not only on soup but on ordinary tag objects too; it searches for matching nodes inside the given object.
    #Drill down level by level
    #div=soup.find('div',class_='tang')
    #print(div.find('a',alt="qi"))
    #print(div.find('a',class_="du"))#if two tags match, it still returns the first one

    #find_all
    #lt=soup.find_all('a')#find all a tags
    #print(lt,len(lt))

    #div=soup.find('div',class_='tang')
    #print(div.find_all('a'))
    #print(div.find_all(['i','b']))#find_all also accepts several tags, as a list
    #print(div.find_all('a',limit=2))#find all, keep only the first 2

    #select: pick out content with CSS selectors
    #Common selectors: tag, class, id, combination, hierarchy, attribute
    #select always returns a list; extract the element you want by index, then read its attributes and content
    #print(soup.select('div > ul > li > a'))#there must be a space on both sides of each > sign
    #print(soup.select('div > ul > li > a')[0])#take the first one
    #print(soup.select('.tang > ul > li > a')[0])#this form works too and returns the same result
    #print(soup.select('#du'))#ids are written like this
    #print(soup.select('#feng')[0].text)#it returns a list: index into it first, then call the content accessor
    #print(soup.select('#feng')[0]['href'])#returns the value of href

    #select can also be called on an ordinary tag object; it finds all matching nodes under that object
    #div=soup.find('div',class_='tang')
    #print(div.select('.du'))
    #print(soup.select('.du'))
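
    soup_text.html is not shipped with these notes; a self-contained sketch with made-up markup exercises the same calls:

    from bs4 import BeautifulSoup

    demo_html = '''
    <div class="tang">
      <ul>
        <li><a href="http://example.com/1" title="qin" class="du">one</a></li>
        <li><a href="http://example.com/2" id="feng">two</a></li>
      </ul>
    </div>
    '''
    demo = BeautifulSoup(demo_html, 'lxml')
    print(demo.a["href"])                        # first <a>'s href
    print(demo.find('a', class_="du").text)      # filter by class
    print(demo.select('div > ul > li > a')[0])   # CSS selector, always returns a list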

     8、

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 11:05:33 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context

    url='http://www.renren.com/970622703/profile'
    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36',
        'Cookie':'anonymid=jv4jjsmt8luy21; ln_uact=17767258153; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; __guid=238633222.311801873786504100.1556674290342.3481; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291856; jebe_key=51ea37de-35c3-4754-82ed-2cc4fbe57341%7C0ff20ead6ae99fd72934c187b694b4f1%7C1556674288363%7C1%7C1556674291858; wp_fold=0; depovince=GW; _r01_=1; JSESSIONID=abcnRiMszrXoLbNlVdXPw; ick_login=4c390ed0-4fe6-4264-b9b2-610a614ac13c; first_login_flag=1; jebecookies=989247e8-b114-48f9-9592-aec3cd10e92b|||||; _de=7266BDD6184F288A5EF7AB01E3CFE338; p=38e98cbf34016e9010c9f1f73791f2423; t=3b04ed4095e7a4b7612203f7169bbc843; societyguester=3b04ed4095e7a4b7612203f7169bbc843; id=970622703; xnsid=8ebbfe1f; ver=7.0; loginfrom=null; monitor_count=9',
    }

    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req)
    print(response.read().decode('utf-8'))

     9、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 08:52:30 2018

    @author: T0001
    """

    html='''<tr>
    <td class="posterColumn">
    <span name="rk" data-value="1"></span>
    <span name="ir" data-value="9.216510839765467"></span>
    <span name="us" data-value="7.791552E11"></span>
    <span name="nv" data-value="1868842"></span>
    <span name="ur" data-value="-1.7834891602345326"></span>
    <div class="unseeable">NOT YET RELEASED</div>
    </td>
    <td class="titleColumn">
    1.
    <a href="/title/tt0111161" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman" >The Shawshank Redemption</a>
    <span class="secondaryInfo">(1994)</span>
    </td>
    <td class="ratingColumn imdbRating">
    <strong title="9.2 based on 1,868,842 user ratings">9.2</strong>
    </td>
    <td class="ratingColumn">
    <div class="seen-widget seen-widget-tt0111161 pending" data-titleid="tt0111161">
    <div class="inline">
    <div class="pending">3.2</div>
    <div class="unseeable">NOT YET RELEASED</div>
    <div class="unseen">4.5</div>
    <div class="rating"></div>
    <div class="seen">Seen</div>
    </div>
    </div>
    </td>
    <td class="watchlistColumn">
    <div class="wlb_ribbon" data-tconst="tt0111161" data-recordmetrics="true"></div>
    </td>
    </tr>
    '''

    from lxml import etree

    #Parse a local file
    #tree=etree.parse("filename")

    #Parse a page fetched over the network
    #tree=etree.HTML("html string")

    imdb=etree.HTML(html)

    #Locate by attribute
    #print(imdb.xpath('//span[@name="ir"]'))
    #print(imdb.xpath('//div[@data-tconst]'))

    #Mixed hierarchy and index positioning
    #print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[1]'))#indexing starts at 1
    #print(imdb.xpath('//div[@class="seen-widget seen-widget-tt0111161 pending"]/div/div[@class="unseeable"]'))#locating by attribute works here too

    #print(imdb.xpath('//td[@class="ratingColumn"]//div'))#every div under <td class="ratingColumn">
    #print(imdb.xpath('//td[@class="ratingColumn"]//div[@class="seen"]'))#the tail can also be narrowed by attribute

    #result1=imdb.xpath('//div[@class="inline"]/div[last()-2]')#


    #Logical operators
    #print(imdb.xpath('//div[@class="wlb_ribbon" and @data-tconst="tt0111161"]'))#if one attribute is not selective enough, add another one, joined with "and"

    #Fuzzy matching
    #print(imdb.xpath('//div[contains(@class,"un")]'))#every div with a class attribute that contains "un"
    #print(imdb.xpath('//div[contains(text(),4)]'))#every node whose text contains 4
    #print(imdb.xpath('//div[starts-with(@class,"r")]'))#every div with a class attribute that starts with "r"

    #Get text content
    #print(imdb.xpath('//div[@class="inline"]/div[5]/text()'))#get the node's text

    #Get attributes
    #print(imdb.xpath('//div[@class="inline"]/div[2]/@class'))

    #print(imdb.xpath('//div[@class="inline"]//text()'))#pull out all tag-free text under <div class="inline">

    #print(imdb.xpath('//div[@class="inline"]/div[last()-1]/@class'))#another way to get it

    #Extract the "1." prefix
    #s=imdb.xpath('//td[@class="titleColumn"]/text()')
    #a=[]
    #for i in s:
    #    if i.strip() != "":
    #        a.append(i.strip())

    #s=imdb.xpath('//td[@class="titleColumn"]')
    #k=s[0].xpath('string(.)')
    #l=k.replace(' ', '').replace(' ', '')
    #print(l.strip().split()[0])

    #for i in result:
    #    print(etree.tostring(i))
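
    For reference, two concrete queries against the html string defined above:

    print(imdb.xpath('//td[@class="titleColumn"]/a/text()'))                  # ['The Shawshank Redemption']
    print(imdb.xpath('//td[@class="ratingColumn imdbRating"]/strong/text()')) # ['9.2']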

     10、

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 11:13:30 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse


    url='http://www.baidu.com/'
    proxy={
        'http':'222.135.92.68:38094'
    }

    #Create the handler
    handler=urllib.request.ProxyHandler(proxy)
    #Create the opener
    opener=urllib.request.build_opener(handler)

    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))
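
    If every later urlopen call should also go through the proxy, the opener can be installed globally; a small sketch:

    #After this, plain urlopen() is routed through the proxy handler as well
    urllib.request.install_opener(opener)
    response = urllib.request.urlopen(req)
    print(response.getcode())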

     11、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 11:37:22 2018

    @author: T0001
    """

    from lxml import etree
    import numpy as np
    import pandas as pd
    import urllib.request
    import random
    url='http://news.ceic.ac.cn/'
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    headers['User-Agent']=agentStr
    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req).read().decode('utf-8')

    earth=etree.HTML(response)
    result=earth.xpath('//td[@align="center"]/text()')
    result1=earth.xpath('//td[@align="left"]/a/text()')

    data=np.array(result).reshape((-1,5))

    c=np.column_stack((data,result1))

    pd.DataFrame(c,columns=['gf','gdf','dsf','dsgf','fdg','dfgh']).to_csv('dz.csv',index=False)#the column names here are arbitrary placeholders

     12、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Apr 25 19:06:01 2019

    @author: Office
    """
    import urllib.request
    import ssl
    import random
    import json
    import pandas as pd
    from sqlalchemy import create_engine
    #Pool of User-Agent strings to spoof the request headers
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    #Pick a User-Agent at random
    agentStr=random.choice(agentlist)

    def ajaxCrawler(url):
        headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent':agentStr,
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
        }
        req=urllib.request.Request(url,headers=headers)
        #Use ssl to create an unverified context
        context=ssl._create_unverified_context()
        response=urllib.request.urlopen(req,context=context)

        jsonStr=response.read().decode('utf-8')
        jsonData=json.loads(jsonStr)
        return jsonData

    title=[]
    score=[]
    release_date=[]
    vote_count=[]
    for i in range(1,100):
        url='https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start='+str(i*20)+'&limit=20'
        info=ajaxCrawler(url)
        for j in info:
            title.append(j["title"])
            score.append(j['score'])
            release_date.append(j['release_date'])
            vote_count.append(j['vote_count'])

    #Convert to a DataFrame
    data=pd.DataFrame({'score':score,'title':title,'release_date':release_date,'vote_count':vote_count},columns=['score','title','release_date','vote_count'])
    #Save to csv
    #data.to_csv('dy.csv')

    #Save to MySQL
    engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
    data.to_sql('douban',engine,if_exists="replace")
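
    A quick way to verify the write is to read the table straight back (same demo database assumed):

    check = pd.read_sql('douban', engine)
    print(check.head())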

     13、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jun 13 20:12:39 2019

    @author: wqq
    """

    import urllib.request
    import re
    import random
    import gzip
    import numpy as np
    import pandas as pd
    url="http://esf.hz.fang.com/"
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    headers['User-Agent']=agentStr
    req=urllib.request.Request(url,headers=headers)

    response=urllib.request.urlopen(req)
    #print(response.info().get('Content-Encoding'))
    string=gzip.decompress(response.read()).decode('gbk')

    phone_reg=r'''<span class="red"><b>(.*?)</b>'''
    phone_pat=re.compile(phone_reg)
    z_jia=re.findall(phone_pat,string)

    phone_reg=r'''<span>(.*?)元/㎡</span>'''
    phone_pat=re.compile(phone_reg)
    d_jia=re.findall(phone_pat,string)


    phone_reg=r'''<p class="tel_shop">(.*?)<span class="people_name">'''
    phone_pat=re.compile(phone_reg,re.S)
    match=re.findall(phone_pat,string)

    g_ju=[]
    m_ji=[]
    l_ceng=[]
    c_xiang=[]
    n_dai=[]
    for i in match:
        k=i.split()
        g_ju.append(k[0])
        m_ji.append(k[1].split("<i>|</i>")[1])
        if "<i>|</i>" not in k[2]:
            l_ceng.append(k[2])
        else:
            l_ceng.append(k[2].split("<i>|</i>")[1])

        if "<i>|</i>" not in k[3]:
            c_xiang.append(k[3])
        else:
            c_xiang.append(k[3].split("<i>|</i>")[1])

        if "<i>|</i>" not in k[4]:
            n_dai.append(k[4])
        else:
            n_dai.append(k[4].split("<i>|</i>")[1])

    phone_reg=r'''<a target="_blank" href="/house-xm\d+/" title=(.*?)>'''
    phone_pat=re.compile(phone_reg)
    g_yu_name=re.findall(phone_pat,string)


    phone_reg=r'''<span class="tit_shop">(.*?)</span>'''
    phone_pat=re.compile(phone_reg)
    title=re.findall(phone_pat,string)

    phone_reg=r'''<span>(.*?)</span>'''
    phone_pat=re.compile(phone_reg)
    d_duan=re.findall(phone_pat,string)[::2]
    del d_duan[-1]#drop the trailing entry

    pd.DataFrame({'title':title,'g_ju':g_ju,
                  'm_ji':m_ji,'l_ceng':l_ceng,
                  'c_xiang':c_xiang,'n_dai':n_dai,
                  'z_jia(万)':z_jia,'d_jia(元/m2)':d_jia,
                  'g_yu_name':g_yu_name,'d_duan':d_duan},
                 columns=['title','g_ju','m_ji','l_ceng','c_xiang','n_dai','z_jia(万)','d_jia(元/m2)','g_yu_name','d_duan']).to_csv("二手房.csv",index=False)
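
    The unconditional gzip.decompress above assumes the server actually compressed the body (the commented-out Content-Encoding print hints at checking); a guarded variant of that step, as a sketch:

    raw = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        string = gzip.decompress(raw).decode('gbk')
    else:
        string = raw.decode('gbk')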

     14、

    # -*- coding: utf-8 -*-
    """
    Created on Mon Apr 29 08:32:04 2019

    @author: Office
    """
    import urllib.request
    import random
    import re

    def handle_request(url,page=None):
        if page != None:
            url=url+str(page)+".html"
        agentlist=[
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]
        #Pick a User-Agent at random
        agentStr=random.choice(agentlist)
        headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'User-Agent':agentStr,
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
        }
        request=urllib.request.Request(url,headers=headers)
        return request

    def get_text(a_href):
        #Build the request object via the helper
        request = handle_request(a_href)
        #Send the request and get the response
        content = urllib.request.urlopen(request).read().decode('utf-8')
        #Parse the content
        pattern = re.compile(r'<div class="neirong">(.*?)</div>',re.S)
        lt = pattern.findall(content)
        text = lt[0]

        #A regex that strips every image tag from the content
        pat=re.compile(r'<img .*?>')
        text=pat.sub('',text)
        return text

    def parse_content(content):
        #The regex
        pattern=re.compile(r'<h3><a href="(/lizhi/qianming/\d+\.html)">(.*?)</a></h3>')
        #findall returns a list of tuples: the first element of each tuple is what
        #the first capture group matched, the second element is what the second
        #capture group matched
        lt=pattern.findall(content)
        #Iterate over the list
        for href_title in lt:
            #Link to the article body
            a_href = 'http://www.yikexun.cn' + href_title[0]
            #The title
            title = href_title[-1]
            #Request a_href and fetch the body text
            text = get_text(a_href)
            #Append it to an html file
            string = '<h1>%s</h1>%s' % (title,text)
            with open('lizhi.html', 'a', encoding='utf8') as f:
                f.write(string)

    def main():
        url='http://www.yikexun.cn/lizhi/qianming/list_50_'
        start_page=int(input('Enter the start page: '))
        end_page=int(input('Enter the end page: '))
        for page in range(start_page,end_page+1):#include the end page
            #Build the request from the url and page
            request=handle_request(url,page)
            content=urllib.request.urlopen(request).read().decode('utf-8')

            #Parse the content
            parse_content(content)

    main()

    15、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Jul 15 14:16:22 2018

    @author: T0001
    """
    #Download images

    import urllib.request
    from lxml import etree
    import random

    url="https://www.ivsky.com/tupian/ziranfengguang/"
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }


    proxy=[
        {'http':'http://61.164.39.66:53281'},
        {'http':'http://116.209.57.18:9999'},
        {'http':'http://183.148.133.77:9999'},
        {'http':'http://211.23.149.29:80'},
        {'http':'http://39.137.69.10:8080'}
    ]

    end_proxy=random.choice(proxy)

    proxy_handler=urllib.request.ProxyHandler(end_proxy)

    opener=urllib.request.build_opener(proxy_handler)

    req=urllib.request.Request(url,headers=headers)

    response=opener.open(req)
    html=response.read().decode("utf-8")

    html=etree.HTML(html)
    a=html.xpath('//div[@class="il_img"]/a/@href')

    for i in a:
        url_new="https://www.ivsky.com"+i
        req1=urllib.request.Request(url_new,headers=headers)#was url; url_new is the album page just built above
        response1=opener.open(req1)
        html1=response1.read().decode("utf-8")
        html_pic=etree.HTML(html1)
        pic=html_pic.xpath('//div[@class="il_img"]/a/img/@src')

        for j in pic:
            end_url="https:"+j
            req2=urllib.request.Request(end_url,headers=headers)
            response2=opener.open(req2)
            html2=response2.read()
            with open('pic/'+j.split('/')[-1],'wb') as f:#the pic/ directory must already exist
                f.write(html2)
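
    The final write assumes a pic/ directory next to the script; one guard line at the top covers it:

    import os
    os.makedirs('pic', exist_ok=True)   # create the output directory up front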

     16、

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 17:33:25 2019

    @author: admin
    """
    import urllib.request
    url='http://www.baidu.com/'

    #Create the handler
    handler=urllib.request.HTTPHandler()
    #Create the opener
    opener=urllib.request.build_opener(handler)

    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))

    17、

    # -*- coding: utf-8 -*-
    """
    Created on Fri Apr 26 08:37:26 2019

    @author: Office
    """

    import urllib.request
    import random
    import re

    url="https://www.qiushibaike.com/text/page/1/"
    proxy=[
        {'http':'http://61.164.39.66:53281'},
        {'http':'http://116.209.57.18:9999'},
        {'http':'http://183.148.133.77:9999'},
        {'http':'http://211.23.149.29:80'},
        {'http':'http://39.137.69.10:8080'}
    ]

    end_proxy=random.choice(proxy)

    proxy_handler=urllib.request.ProxyHandler(end_proxy)

    opener=urllib.request.build_opener(proxy_handler)

    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }
    req=urllib.request.Request(url,headers=headers)

    response=opener.open(req)
    html=response.read().decode('utf-8')
    print(html)
    pat=r'<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
    re_joke=re.compile(pat,re.S)
    divsList=re_joke.findall(html)
    dic={}
    for i in divsList:
        #Username
        re_u=re.compile(r'<h2>(.*?)</h2>',re.S)
        username=re_u.findall(i)
        username=username[0]

        #The joke
        re_d=re.compile(r'<div class="content"> <span>(.*?)</span>',re.S)
        duanzi=re_d.findall(i)
        duanzi=duanzi[0]
        dic[username]=duanzi
    print(dic)
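
    Keying the dict by username keeps only the last joke from a user who appears twice on the page; collecting (username, joke) pairs instead avoids that, as a sketch:

    jokes = []
    for i in divsList:
        username = re.findall(r'<h2>(.*?)</h2>', i, re.S)[0]
        duanzi = re.findall(r'<div class="content"> <span>(.*?)</span>', i, re.S)[0]
        jokes.append((username.strip(), duanzi.strip()))
    print(jokes)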

     18、

    # -*- coding: utf-8 -*-
    """
    Created on Wed May 1 08:50:22 2019

    @author: admin
    """

    import urllib.request
    import urllib.parse
    import http.cookiejar
    import ssl

    ssl._create_default_https_context = ssl._create_unverified_context
    #Mimic a real browser: once the login POST has been sent, keep the cookie inside the code
    #Create a cookiejar object
    cj=http.cookiejar.CookieJar()
    #Create a handler from the cookiejar
    handler=urllib.request.HTTPCookieProcessor(cj)
    #Build an opener from the handler
    opener=urllib.request.build_opener(handler)

    url='http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=201943946542 '

    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/63.0.3239.132 Safari/537.36'
    }

    fromdata={
        'email':'17767258153',
        'icode':'',
        'origURL':'http://www.renren.com/home',
        'domain':'renren.com',
        'key_id':'1',
        'captcha_type':'web_login',
        'password':'204b8409cfb80c1d46a7134d150cd281a1808d1c0429eb7334a3fa8f4c6ae327',
        'rkey':'b8871697112ad27ac3a61f5e85ebf5b4',
        'f':'http%3A%2F%2Fwww.renren.com%2F970622703',
    }

    fromdata=urllib.parse.urlencode(fromdata).encode('utf-8')
    req=urllib.request.Request(url,headers=headers)
    response=opener.open(req,data=fromdata)
    #print(response.read().decode('utf-8'))

    get_url="http://www.renren.com/970622703/profile"
    req=urllib.request.Request(get_url,headers=headers)
    response=opener.open(req)
    print(response.read().decode('utf-8'))
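
    The CookieJar above lives only in memory, so the login is lost when the script exits; http.cookiejar.MozillaCookieJar can persist it between runs. A minimal sketch (the filename is arbitrary):

    import http.cookiejar

    cj = http.cookiejar.MozillaCookieJar('renren_cookies.txt')
    #...log in through an opener built from HTTPCookieProcessor(cj), then:
    cj.save(ignore_discard=True, ignore_expires=True)
    #A later session restores the login state with:
    cj.load('renren_cookies.txt', ignore_discard=True, ignore_expires=True)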

    19、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 11:28:33 2019

    @author: Office
    """

    import urllib.request
    from lxml import etree
    import random
    import numpy as np
    import pandas as pd
    from sqlalchemy import create_engine

    url='http://tubiao.17mcp.com/Ssq/index-500.html'
    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
    ]
    agentStr=random.choice(agentlist)

    headers={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent':agentStr,
        'X-REQUESTED-With':'XMLHttpRequest',
        'Content-Type':'application/x-www-form-urlencoded'
    }

    req=urllib.request.Request(url,headers=headers)
    response=urllib.request.urlopen(req).read().decode('utf-8')
    html=etree.HTML(response)
    data=html.xpath('//tr/td[@style="color:White"]/text()')
    qihao=html.xpath('//tr[@style="height: 25px"]/td[1]/text()')

    da_ta=np.array(data).reshape(-1,7)
    qi_hao=np.array(qihao)

    end_data=np.column_stack((qi_hao,da_ta))

    finnal_data=pd.DataFrame(end_data,columns=['qihao','one','two','three','four','five','six','seven'])

    #Save to csv
    finnal_data.to_csv('双色球.csv',index=False)

    #Save to MySQL
    engine=create_engine('mysql+pymysql://root:123456@localhost/demo')
    finnal_data.to_sql('shungseqiu',engine,if_exists="replace")

    20、

    # -*- coding: utf-8 -*-
    """
    Created on Fri Apr 26 15:36:41 2019

    @author: Office
    """

    import urllib.request
    import random
    import re

    keyname="chakra bracelet"
    key=urllib.request.quote(keyname)
    for i in range(1,2):
        try:
            print("-------- crawling page "+str(i)+" ------------")
            url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
            proxy=[
                {'http':'http://61.164.39.66:53281'},
                {'http':'http://116.209.57.18:9999'},
                {'http':'http://183.148.133.77:9999'},
                {'http':'http://211.23.149.29:80'},
                {'http':'http://39.137.69.10:8080'}
            ]

            end_proxy=random.choice(proxy)

            proxy_handler=urllib.request.ProxyHandler(end_proxy)

            opener=urllib.request.build_opener(proxy_handler)

            agentlist=[
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
                "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
            ]
            agentStr=random.choice(agentlist)

            headers={
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'User-Agent':agentStr,
                'X-REQUESTED-With':'XMLHttpRequest',
                'Content-Type':'application/x-www-form-urlencoded'
            }
            req=urllib.request.Request(url,headers=headers)

            response=opener.open(req)
            data=response.read().decode("utf-8","ignore")

            pat='"pic_url":"//(.*?)"'
            imglist=re.compile(pat).findall(data)
            for j in range(0,len(imglist)):
                try:
                    thisimg=imglist[j]
                    thisimgurl="http://"+thisimg
                    localfile="D:/"+str(i)+"_"+str(j)+".jpg"
                    urllib.request.urlretrieve(thisimgurl,filename=localfile)
                except Exception as err:
                    pass
        except Exception as err:
            pass

     21、

    # -*- coding: utf-8 -*-
    """
    Created on Thu Jun 13 18:12:39 2019

    @author: wqq
    """

    import urllib.request
    import urllib.parse
    import ssl
    import random
    from lxml import etree
    import pandas as pd

    ssl._create_default_https_context = ssl._create_unverified_context
    url = 'https://veromoda.tmall.com/p/rd609297.htm?spm=a1z10.10672-b-s.w5001-17277175636.16.7b822b67cHKn8X&scene=taobao_shop'

    agentlist=[
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0"
    ]
    agentstr = random.choice(agentlist)

    headers = {
        'user-agent':agentstr,
        'Accept': 'image/webp,*/*',
        'Cookie': 'cq=ccp%3D1; cna=OA95FVY8Iw4CAXAKF/liwJ5M; isg=BI2N3dH67G6QcEhAxVcwy0Dzn6nHwsFXFVAU088SySSTxq14l7rRDNtcMJoFHdn0; l=bBNzmI9HqQPbVy7kBOCwquI8aG7OSIOYYuPRwNqXi_5ay1T_qsQOkjo1oe96Vs5RsXTB4mxQgLp9-etks; hng=""; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; x=__ll%3D-1%26_ato%3D0; t=2e15a61bdd752ef76d25e931fbd573ee; lid=%E4%BD%8E%E8%B0%83leoalan; _tb_token_=e1b6ee565fbb5; cookie2=1f4e270456996b258181536824f34637'
    }

    req = urllib.request.Request(url,headers=headers)
    response = urllib.request.urlopen(req)
    data = response.read().decode('gbk')

    imdb = etree.HTML(data)

    title = imdb.xpath('//span[@class="user_name"]/text()')
    adress = imdb.xpath('//div[@class="user_w990"]//a[@target="_blank"]/@href')
    price = imdb.xpath('//span/strong[@class="user_pricetit"]/text()')
    #oldprice = imdb.xpath('//span/span[@class="user_ft14 user_yj"]/text()')

    a=0
    for i in adress:
        adress[a] = 'https:'+i
        a+=1

    pd.DataFrame({
        '商品名称':title,
        '商品链接':adress,
        '商品价格':price
        },
        columns=['商品名称','商品链接','商品价格']
    ).to_excel('D:/天猫商品.xls')

    22、

    # -*- coding: utf-8 -*-
    """
    Created on Sun Apr 28 19:37:12 2019

    @author: Office
    """

    import urllib.request
    import urllib.parse
    from lxml import etree
    import time
    import random
    import os

    def handle_request(url,page):
        #The first page follows a different URL pattern from the rest, so branch on the page number
        if page == 1:
            url = url.format('')
        else:
            url = url.format('_' + str(page))
        headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
        }

        user_angent_list=[
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]

        end_user_angent = random.choice(user_angent_list)
        headers['User-Agent'] = end_user_angent
        request = urllib.request.Request(url,headers=headers)
        return request
    #Parse the content
    def parse_content(content):
        tree = etree.HTML(content)
        image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
        #src2 because the page lazy-loads its images
        #Iterate over the list and download each image
        for image_src in image_list:
            download_image(image_src)

    def download_image(image_src):
        dirpath = 'xinggan'
        #Create the directory if it does not exist yet
        if not os.path.exists(dirpath):
            os.mkdir(dirpath)
        #Derive a filename
        filename = os.path.basename(image_src)
        #Full path for the image
        filepath = os.path.join(dirpath,filename)
        #Send the request and save the image
        headers={
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'X-REQUESTED-With':'XMLHttpRequest',
            'Content-Type':'application/x-www-form-urlencoded'
        }

        user_angent_list=[
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0"
        ]

        end_user_angent = random.choice(user_angent_list)
        headers['User-Agent'] = end_user_angent
        request = urllib.request.Request(image_src,headers=headers)
        response = urllib.request.urlopen(request)
        with open(filepath,'wb') as f:
            f.write(response.read())

    def main():
        url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html'#note the {} slot that handle_request fills in
        start_page = int(input('Enter the start page: '))
        end_page = int(input('Enter the end page: '))
        for page in range(start_page,end_page+1):
            request = handle_request(url,page)
            content = urllib.request.urlopen(request).read().decode('utf-8')
            parse_content(content)
            time.sleep(2)

    if __name__ == '__main__':
        main()

    23、

    # -*- coding: utf-8 -*-
    """
    Created on Sat Jun 30 21:07:14 2018

    @author: Chen
    """

    import pydotplus
    import os
    os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
    import pandas as pd

    #Read the csv file into a DataFrame
    df = pd.read_csv('./data.csv')
    #print(df.head())#handy while testing

    df = df[['weather','temperature','humidity','wind','sports']]
    df['weather'] = df['weather'].map({'晴': 0, '阴': 1, '雨': 2})
    df['temperature'] = df['temperature'].map({'炎热': 0, '适中': 1, '寒冷': 2})
    df['wind'] = df['wind'].map({'弱': 0, '强': 1})

    #Split into a feature table and a label column
    df = df.dropna()
    X = df.drop('sports', axis=1)
    Y = df['sports']

    '''
    #Split into training and test sets
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)
    '''

    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    #Fit on the training split
    #model.fit(X_train, y_train)
    #Fit on the full dataset
    model.fit(X, Y)

    '''
    #Check the model's accuracy on the test set
    y_predict = model.predict(X_test)
    from sklearn.metrics import accuracy_score
    accuracy_score(y_test, y_predict)
    '''

    #Render the tree as a visualization
    dot_data = tree.export_graphviz(model.tree_, out_file=None,
                                    feature_names=X.columns,
                                    class_names=['no','yes'],
                                    filled=True, rounded=True, # leaves_parallel=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)

    nodes = graph.get_node_list()

    for node in nodes:
        if node.get_label():
            values = [int(ii) for ii in node.get_label().split('value = [')[1].split(']')[0].split(',')]
            color = {0: [255,255,224], 1: [255,224,255], 2: [224,255,255]}
            values = color[values.index(max(values))] # print(values)
            color = '#{:02x}{:02x}{:02x}'.format(values[0], values[1], values[2]) # print(color)
            node.set_fillcolor(color)

    graph.write_pdf("tree.pdf")
    graph.write_png("tree.png")

