zoukankan      html  css  js  c++  java
  • Python_爬虫_urllib解析库

    简介:提取网页保存到txt文件中 + 解析txt文件内容,取出内容

    # Fetch the 163.com homepage, save it to a txt file, then read it back,
    # parse the saved text as JSON, and print selected fields of each item.
    from urllib import request
    import json  # fixed: `import re.json` is invalid syntax; only json is used

    url = "http://www.163.com"
    # Site serves gbk-encoded HTML
    response = request.urlopen(url).read().decode("gbk")
    # Write the page into a txt file
    with open("163/163.txt", "w", encoding="gbk") as f:
        f.write(str(response))
    # Read back with the SAME encoding it was written with
    # (original opened with utf-8, which would raise UnicodeDecodeError)
    with open("163/163.txt", "r", encoding="gbk") as f:
        content = f.read()
    c1 = content.replace(" ", "").replace("\n", "")  # strip spaces and newlines
    c2 = "[" + c1 + "]"  # wrap as a JSON array
    cc = json.loads(c2)  # deserialize (fixed: `json.loads.(c2)` was a syntax error)
    for item in cc:      # fixed: loop variable was `i` but body referenced undefined `x`
        print(item['title'])   # value of the "title" key of each item
        print(item['docurl'])  # value of the "docurl" key of each item
    

    简介:提取美团链接保存在一个txt文件中

    #coding=utf-8
    # Scrape the Meituan Xi'an hotel page and append each hotel link
    # (href + link text) to a txt file, printing a progress counter.
    from urllib import request
    from bs4 import BeautifulSoup

    req = request.urlopen("http://hotel.meituan.com/xian/")
    content = req.read().decode("utf8")
    bsObj = BeautifulSoup(content, "html.parser")
    # All <a class="poi-title"> anchors — one per hotel
    pcontent = bsObj.findAll("a", {"class": "poi-title"})

    i = 1
    with open("meituan/url.txt", "a+", encoding="utf8") as f:
        for x in pcontent:
            f.write(x['href'] + "\n")  # the link target from the tag attribute
            f.write(x.get_text())      # the tag's visible text
            # fixed: "第"+'int(%s)'%(i)+"条url" printed the literal text int(1);
            # use a plain %d substitution for the counter instead
            print("第%d条url" % i)
            i += 1
    

    范例:功能实现后提高代码质量

    #coding=utf-8
    # Collect all hotel URLs for the current city: page through the Meituan
    # hotel-search JSON API (20 results per page, 3 pages) and append each
    # hotel's page URL to a txt file.
    from urllib import request, error
    from bs4 import BeautifulSoup
    import json

    for page in range(3):
        # offset paginates in steps of 20; the long query string is the API contract
        url = "https://ihotel.meituan.com/hbsearch/HotelSearch?utm_medium=pc&version_name=999.9&cateId=20&attr_28=129&uuid=12B729E22135402D5CBC1432A179A735CF81DF50626153919EC2C66D46DCB233%401517811001478&cityId=42&offset=" + str(page * 20) + "&limit=20&startDay=20180205&endDay=20180205&q=&sort=defaults"
        try:
            req = request.urlopen(url)
            content = req.read().decode("utf8")
            content_dict = json.loads(content)  # API returns JSON, no HTML parsing needed
            with open("meituan/url.txt", "a+", encoding="utf8") as f:
                for x in content_dict['data']['searchresult']:
                    print(x['poiid'])
                    hotel_url = "http://hotel.meituan.com/%s/" % x['poiid']
                    # fixed: the "\n" literal was physically broken across two lines
                    f.write(hotel_url + "\n")
        except error.URLError as e:
            # network/HTTP failure for this page: report and continue with the next
            print(e.reason)
    

    urllib添加代理IP

    # -*- coding: UTF-8 -*-
    # Demonstrate routing urllib requests through an HTTP proxy.
    from urllib import request

    if __name__ == "__main__":
        # Target page: an IP-echo service, so the proxy's effect is visible
        url = 'http://2017.ip138.com/ic.asp'
        #url = 'http://www.whatismyip.com.tw'
        # Handler that routes plain-HTTP traffic through the proxy address
        handler = request.ProxyHandler({'http': '113.124.226.174:808'})
        # Opener built on top of the proxy handler
        opener = request.build_opener(handler)
        # Default headers sent with every request made by this opener:
        # a browser-like User-Agent, plus a Host header for the IP-check site
        opener.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'),
            ('Host', 'www.whatismyip.com.tw')
        ]
        # Install globally so bare urlopen() calls also use the proxy
        request.install_opener(opener)
        # Fetch through the installed opener and decode (page is served as gbk)
        response = request.urlopen(url)
        html = response.read().decode("gbk")
        print(html)
    
    
  • 相关阅读:
    OCP-1Z0-053-V13.02-252题
    Java中list.get(index)报错
    OCP-1Z0-053-V13.02-103题
    Hash unique和Sort unique
    如何解决mysql数据库8小时无连接自动关闭
    OCP-1Z0-053-V13.02-538题
    OCP-1Z0-053-V13.02-537题
    OCP-1Z0-053-V13.02-518题
    用绘本回忆青春创业经历——leo鉴书46
    OCP-1Z0-053-V13.02-502题
  • 原文地址:https://www.cnblogs.com/hellangels333/p/8602011.html
Copyright © 2011-2022 走看看