zoukankan      html  css  js  c++  java
  • BeautifulSoup抓取网页数据

    # from urllib import request
    from bs4 import BeautifulSoup
    #
    # req = request.Request("http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab.html?xhbh=ff8080815c04a864015c596c4c177699&xmxh=null&area=00390019&xyghbh=ff80808151561b4701517a3e43825e4f&lastcgsl=0&cgje=0.0&lastcgje=0.0&cgsl=0&isnwwbz=ww&czy=null&lbbs=null")
    # req.add_header("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
    # res = request.urlopen(req)
    # res = request.urlopen(req)
    #
    # #print(res.read().decode("utf-8"))
    # res = res.read()
    #
    # # doc = open("aa.html","wb+")
    # # doc.write(res)
    # soup = BeautifulSoup(res, 'html.parser', from_encoding='utf-8')
    # #title = soup.strong
    #
    # company = soup.find_all(target='_Blank')
    #
    # print(company)
    from urllib import request,parse
    
    # Form fields posted to the supplier-list endpoint. A dict keeps insertion
    # order, so the encoded body lists the fields in exactly this sequence.
    form_fields = {
        'formids': 'If,sl,jbcsPage,ghsPage,jgqsPage,picPage,spxqPage,Xzsp,Gwc,Xmxx,Dzdd,Ddys,selgys',
        'submitmode': '',
        'submitname': '',
        'If': 'F',
        'xhbh': 'ff8080815c04a864015c596c4c177699',
        'area': '00390019',
        'ppmc': '联想',
        'czy': '',
        'scjg': 4126.0,
        'zdjg': 4126.0,
        'xyghbh': 'ff80808151561b4701517a3e43825e4f',
        'xmxh': '',
        'lastcgsl': '',
        'cgje': 0,
        'lastcgje': 0,
        'cgsl': 0,
        'isnwwbz': 'ww',
        'lbbs': '',
        'gysdqzdbj': '4126.0',
        'ghsmc': '点击选择供应商',
        'sl': 0,
        'ghsPage': '供货商',
    }

    # urlencode stringifies each value and percent-encodes it (UTF-8 by default).
    login_data = parse.urlencode(form_fields)
    
    # Build the request object for the form endpoint that returns the supplier table.
    req = request.Request('http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab,form.sdirect')

    # Headers sent along with the form.
    # NOTE(review): the JSESSIONID is a hard-coded session cookie; presumably it
    # stops working once that server-side session expires -- confirm and refresh.
    req.add_header('Origin', 'http://www.hngp.gov.cn')
    req.add_header('Cookie','JSESSIONID=E6738337F2A4BAE45C6127C732DA7D54')
    req.add_header('User-Agent', 'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25')
    req.add_header('Referer', 'http://www.hngp.gov.cn/wsscnew/egp/public/gg_spzsxx/SpxhMainTab.html?xhbh=ff8080815c04a864015c596c4c177699&xmxh=null&area=00390019&xyghbh=ff80808151561b4701517a3e43825e4f&lastcgsl=0&cgje=0.0&lastcgje=0.0&cgsl=0&isnwwbz=ww&czy=null&lbbs=null')

    # Supplying `data` makes urlopen send a POST with the UTF-8 encoded form body.
    # The timeout stops the script from hanging forever on an unresponsive
    # server, and the `with` block closes the connection once the body is read.
    with request.urlopen(req, data=login_data.encode('utf-8'), timeout=30) as response:
        # Read the whole page into memory (bytes) before handing it to the parser.
        res = response.read()
    
    # Parse the downloaded bytes with the built-in 'html.parser' backend and tell
    # BeautifulSoup to decode them as UTF-8.
    soup = BeautifulSoup(res, 'html.parser', from_encoding='utf-8')
    # Collect every <tr> element in the page. find_all is the current bs4
    # spelling; findAll is only a legacy alias for the same method.
    tr_list = soup.find_all('tr')
    
    # Check the row at index 1 of the table (presumably the first data row after
    # a header row -- TODO confirm against the live page).
    # One check is sufficient: looping over tr_list while always reading
    # tr_list[1] would only repeat identical work on every iteration.
    # NOTE(review): if every row was meant to be examined, iterate over tr_list
    # and read the cells of each row instead of index 1.
    if tr_list:
        # Raises IndexError when the page has only a single <tr>.
        td_list = tr_list[1].find_all('td')

        # Cells 4 and 5 of that row are read as the price and the name.
        price = td_list[4].get_text()
        name = td_list[5].get_text()

        if name != '韦玮':
            # Stop the script (exit status 0). Raising SystemExit directly does
            # the same as exit() without relying on the helper that the `site`
            # module injects for interactive use.
            raise SystemExit
    #print(data)
  • 相关阅读:
    SQL Server:创建索引视图
    Asp.Net常用函数
    SQL Server联机丛书:删除存储过程
    音乐知识全接触
    深入透析样式表滤镜
    有一天,爸妈会变老
    今天终于买到票啦~~
    今天,回到上海啦~~(附工作生涯回顾)
    十八问:怎么才是喜欢编程
    把旧光驱改CD播放机的方法
  • 原文地址:https://www.cnblogs.com/hanshuai0921/p/7903293.html
Copyright © 2011-2022 走看看