  • Simple anti-crawler bypass code

    import urllib.request

    # Send a request
    res = urllib.request.urlopen('http://www.baidu.com/')
    print(type(res))

    # Get the status code
    # print(res.getcode())

    # Get the URL that was requested
    # print(res.geturl())

    # Get the response headers
    # print(res.getheaders())

    # Read the whole page. read() returns bytes, so the result must be decoded.
    # encode: text  -> bytes
    # decode: bytes -> text
    # Common encodings: utf-8, gbk, gb2312
    # print(res.read().decode('utf-8'))

    # with open("baidu.html", "w", encoding="utf-8") as f:
    #     f.write(res.read().decode("utf-8"))
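    urlopen raises an exception when a request fails, so a real crawler should catch those. A minimal sketch using the standard urllib.error module, reusing the same URL as above:

    from urllib import request, error

    # HTTPError covers 4xx/5xx responses; URLError covers lower-level
    # failures such as DNS errors or refused connections.
    try:
        res = request.urlopen('http://www.baidu.com/', timeout=5)
        print(res.getcode())
    except error.HTTPError as e:
        print("HTTP error:", e.code, e.reason)
    except error.URLError as e:
        print("Connection failed:", e.reason)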

    # Download a web page to a local file
    # url = 'http://www.baidu.com/'
    # urllib.request.urlretrieve(url=url, filename='baidu.html')

    # Download an image
    # img_url = "http://b-ssl.duitang.com/uploads/item/201601/28/20160128084015_z3cUP.jpeg"
    # name = img_url.rsplit("/", 1)[-1]  # last path segment, e.g. 20160128084015_z3cUP.jpeg
    # urllib.request.urlretrieve(url=img_url, filename=name)


    # Download a video
    # video_url = 'http://v6-default.ixigua.com/741142c2612117615b8343d7a6c12643/5cece6c6/video/m/2204a5c8c6f50be412db774c6d688b6bade1162097c100008550afaf2a34/?rc=M29wdjxrNHFxbTMzaTczM0ApQHRAbzQ5NTM7MzQzMzY3NDUzNDVvQGg1dilAZzN3KUBmM3UpZHNyZ3lrdXJneXJseHdmNzZAMnJzYW5eLV5hXy0tYS0vc3MtbyNvIzI0MC8vMS0uNDAxNTI2LTojbyM6YS1vIzpgLXAjOmB2aVxiZitgXmJmK15xbDojMy5e&vfrom=xgplayer'
    # urllib.request.urlretrieve(url=video_url, filename="toutiao.mp4")


    url = 'http://www.baidu.com/'
    # Build request headers; a browser User-Agent gets past the most basic anti-crawler checks
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }

    # Build the request
    req = urllib.request.Request(url=url, headers=headers)
    # Send it and get the response
    res = urllib.request.urlopen(req)
    # Check the status code
    print(res.getcode())



    import urllib.parse
    import urllib.request

    url1 = 'http://www.baidu.com/s?'
    name = input('Enter the search term: ')
    source = {
        'wd': name
    }
    # urlencode turns the dict into a percent-encoded query string
    url2 = urllib.parse.urlencode(source)
    url3 = url1 + url2
    print(url3)

    res = urllib.request.urlopen(url3)
    print(res.getcode())


    # Result: http://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
    # (%E7%BE%8E%E5%A5%B3 is the UTF-8 percent-encoding of the search term;
    #  the hex case does not matter: %e7%be%8e%e5%a5%b3 is equivalent)
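    parse.quote and parse.unquote do the same percent-encoding for a bare string; a quick round-trip sketch (assuming the search term was 美女, which matches the bytes above):

    from urllib import parse

    # quote percent-encodes the UTF-8 bytes; unquote reverses it (case-insensitive)
    encoded = parse.quote('美女')                   # '%E7%BE%8E%E5%A5%B3'
    decoded = parse.unquote('%e7%be%8e%e5%a5%b3')   # '美女'
    print(encoded, decoded)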



    Baidu Tieba
    # Imports
    import urllib.parse
    import urllib.request

    # Fixed base URL
    url1 = 'http://tieba.baidu.com/f?'

    name = input('Enter the Tieba forum name: ')
    pge = int(input('Enter the number of pages: '))

    # Query keyword
    source = {
        'kw': name
    }
    # Percent-encode the keyword
    url2 = urllib.parse.urlencode(source)

    # Join into the full URL
    url3 = url1 + url2
    for page in range(1, pge + 1):
        # Tieba pages step by 50 posts: page 1 -> pn=0, page 2 -> pn=50, ...
        pn = (page - 1) * 50
        full_url = url3 + "&pn=%s" % pn
        # Send the request
        res = urllib.request.urlopen(full_url)
        # Check the status code
        print(res.getcode())




    import urllib.parse
    import urllib.request

    # kw: forum name; pn: paging offset, stepping by 50 and starting at 0
    url = "https://tieba.baidu.com/f?"
    name = input("Enter the Tieba forum name: ")
    page = int(input("Enter the number of pages to crawl: "))
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        "Host": "tieba.baidu.com"
    }
    source = {
        "kw": name
    }
    url1 = urllib.parse.urlencode(source)
    url2 = url + url1
    for i in range(1, page + 1):
        pn = (i - 1) * 50
        full_url = url2 + "&pn=%s" % pn
        req = urllib.request.Request(url=full_url, headers=headers)
        res = urllib.request.urlopen(req)
        # print(res.getcode())
        # Save the response we already have; urlretrieve would issue a second
        # request without the custom headers.
        with open("page_%s.html" % i, "w", encoding="utf-8") as f:
            f.write(res.read().decode("utf-8"))

    Baidu Translate
    from urllib import request, parse
    import json

    # The POST endpoint
    url = "https://fanyi.baidu.com/sug"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    name = input("Enter the text to translate: ")
    # Data for the POST request
    form = {
        "kw": name
    }
    # urlencode the dict; the result is a string
    forms = parse.urlencode(form)
    # Build the POST request. The difference between POST and GET here is the data argument.
    # POST data must be bytes, so convert with bytes() (or forms.encode("utf-8")).
    req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
    # Send the request
    res = request.urlopen(req)
    # Read the response
    content = res.read().decode("utf-8")
    # print(content)
    # Parse the JSON string into a dict
    res = json.loads(content)
    print(res)
    print(res['data'][0]['v'])
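    The 'data' list can be empty when the word has no suggestions, so indexing [0] may raise. A slightly more defensive sketch; 'data' and 'v' appear in the code above, while 'k' (the matched word) is an assumption about the payload:

    # Iterate over all suggestions instead of hard-coding index 0;
    # 'k' is assumed to hold the matched word, 'v' its translation
    if res.get('data'):
        for entry in res['data']:
            print(entry.get('k'), '->', entry.get('v'))
    else:
        print("No suggestions returned")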




    Douban Movies
    from urllib import request, parse
    import json

    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20"

    headers = {
        "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4"
    }
    # start/limit are also sent as POST data (the query string above already
    # carries the same parameters)
    form = {
        "start": "40",
        "limit": "20"
    }
    forms = parse.urlencode(form)
    req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
    response = request.urlopen(req)
    content = response.read().decode("utf-8")
    # The endpoint returns a JSON list of movie dicts
    res = json.loads(content)
    print(res)
    for i in res:
        title = i['title']
        actors = i['actors']
        print(title)
        print(actors)







    Simulated login
    from urllib import request

    url = "https://user.qzone.qq.com/2862346891"
    # Reuse a cookie string captured from a logged-in browser session
    headers = {
        "User-Agent": "Safari5.0:Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7) AppleWebKit/534.16+ (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4",
        "cookie": "tvfe_boss_uuid=58a76f49bc6ae610; pgv_pvid=7834310198; _qpsvr_localtk=0.6362081929899539; pgv_pvi=9847598080; pgv_si=s2316292096; pgv_info=ssid=s461279900; uin=o2862346891; skey=@VHWWL050N; ptisp=ctc; RK=hRqQfr4Nwh; ptcz=fc6eb6560e4aa5836aa5f0ac644eede30edc9c95840ea9126bb7462bcc1e13c6; p_uin=o2862346891; pt4_token=uvlbuVIP21ETttWCMi*1wnd2sjHOOsnfKyW3A0vMbqI_; p_skey=ynMqbG7hjPmWuwpOwlccdBS*I36xq9z3jBf83sJq7Ik_; Loading=Yes; qz_screen=1280x720; 2862346891_todaycount=0; 2862346891_totalcount=29429; QZ_FE_WEBP_SUPPORT=1; __Q_w_s_hat_seed=1; rv2=80336DBCB68C00892BB825EA0FACDE3AD70C89C10A1903D221; property20=63A613D7498074A6D1571F37A4941C048B9F458DA01089D6A40BF0D8EB5D330F6A4905911FFF9094; cpu_performance_v8=11; v6uin=2862346891|qzone_player"
    }
    req = request.Request(url=url, headers=headers)
    response = request.urlopen(req)
    content = response.read().decode("utf-8")
    with open("qq.html", "w", encoding="utf-8") as f:
        f.write(content)
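    Hard-coding a captured cookie string stops working once the session expires. A sketch of the usual alternative, letting urllib track cookies in an http.cookiejar.CookieJar (this only manages cookies; a real login flow is still needed to obtain them):

    from urllib import request
    from http import cookiejar

    # The opener stores cookies from responses and resends them automatically
    # on later requests through the same opener.
    jar = cookiejar.CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(jar))
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")]
    response = opener.open("https://user.qzone.qq.com/2862346891")
    print(response.getcode(), "cookies stored:", len(jar))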


    KFC store locations
    from urllib import request, parse
    import json

    url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    size = int(input("How many records to fetch: "))
    form = {
        "cname": "北京",
        "pid": "",
        "pageIndex": 1,
        "pageSize": size,
    }
    forms = parse.urlencode(form)
    req = request.Request(url=url, data=bytes(forms, encoding="utf-8"), headers=headers)
    response = request.urlopen(req)
    result = response.read().decode("utf-8")
    res = json.loads(result)
    print(res)
    for i in res['Table1']:
        addressDetail = i["addressDetail"]
        storeName = i["storeName"]
        print("addressDetail:", addressDetail)
        print("storeName:", storeName)












  • Original post: https://www.cnblogs.com/wyf2019/p/10946334.html