zoukankan      html  css  js  c++  java
  • python(初学提取html页面元素,借用老师)

    # -*- coding: utf-8 -*-
    import os
    import re
    import urllib2
    
    def mean_audience_score(id, page=None):
        """Return the mean audience score shown on a film's mtime.com page.

        Every ``<span class="db_point ml6">N.N</span>`` element found in the
        page source is treated as one score, and the scores are averaged.

        Args:
            id: Film identifier as a string; it is inserted into
                ``http://movie.mtime.com/<id>/`` to build the page URL.
            page: Optional, already-downloaded page source.  When given, no
                HTTP request is made and ``id`` is not used.

        Returns:
            The arithmetic mean of the scores found, as a float, or 0.0 when
            the page contains no score element.
        """
        if page is None:
            sc_url = "http://movie.mtime.com/" + id + "/"
            sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
            sc_page = urllib2.urlopen(sc_req)
            try:
                page = sc_page.read()
            finally:
                # urllib2 responses are not context managers; close explicitly
                # so the connection is released even if read() fails.
                sc_page.close()

        # The digits must be written as \d (the escapes were missing, so the
        # pattern only matched literal "d" characters).  The capture group
        # hands back just the number, so no second regex pass is needed.
        scores = re.findall(
            r'<span class="db_point ml6">(\d+(?:\.\d+)?)</span>', page)
        if not scores:
            return 0.0
        return sum(float(score) for score in scores) / len(scores)
    
    # Download the theater listing page for Wuhu and list the films found in
    # its embedded "hotplaySvList" JavaScript array as id -> title pairs.
    url = 'http://theater.mtime.com/China_Anhui_Province_Wuhu/'
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    webpage = urllib2.urlopen(req)
    try:
        # Keep the full body: multiplying the string by 0 produced an empty
        # page, so the start tag could never be found.
        strw = webpage.read()
    finally:
        webpage.close()
    print(strw)

    # NOTE(review): assumes the page embeds the list as
    # ``hotplaySvList = [{...},{...}];`` -- confirm against the live page.
    start_marker = 'hotplaySvList = ['
    tg_start = strw.find(start_marker)
    print(tg_start)
    if tg_start == -1:
        print('not find start tag')
        # SystemExit (unlike os._exit) flushes buffered output, and a
        # non-zero status reports the failure to the caller.
        raise SystemExit(1)

    tmp = strw[tg_start:]
    print(tmp)
    # The first ';' after the marker is taken as the end of the assignment.
    tg_end = tmp.find(';')
    print(tg_end)
    if tg_end == -1:
        print('not find end tag')
        raise SystemExit(1)

    tmp = tmp[len(start_marker):tg_end]
    print(tmp)

    dict_film = {}
    for t0 in tmp.split("},{"):
        ls_t = t0.split(',')
        # The title is the last double-quoted value of the last field; skip
        # entries (e.g. an empty list) that do not contain one.
        quoted = ls_t[-1].split('"')
        if len(quoted) < 2:
            continue
        # The id is whatever follows the last ':' in the first field.
        film_id = ls_t[0].split(':')[-1].strip()
        dict_film[film_id] = quoted[-2].strip()
    for t in dict_film:
        print("id:" + t + "  film:" + dict_film[t])
    
  • 相关阅读:
    第7次实践作业 25组
    第6次实践作业 25组
    第5次实践作业
    第4次实践作业
    第3次实践作业
    第2次实践作业
    第1次实践作业
    软工实践个人总结
    2019 SDN大作业
    C语言Il作业01
  • 原文地址:https://www.cnblogs.com/doublekai/p/6857778.html
Copyright © 2011-2022 走看看