zoukankan      html  css  js  c++  java
  • python(初学提取html页面元素,借用老师)

    # -*- coding: utf-8 -*-
    import os
    import re
    import urllib2
    
    def mean_audience_score(id):
        """Return the mean of the scores shown on a film's mtime.com page.

        The page ``http://movie.mtime.com/<id>/`` is downloaded and every
        ``<span class="db_point ml6">N.N</span>`` element is read as one
        score.

        Args:
            id: Film identifier as a string; it is concatenated into the URL.
                (The name shadows the builtin but is kept for callers.)

        Returns:
            The arithmetic mean of the scores found, as a float, or 0.0 when
            the page contains no matching element.

        Errors raised by ``urllib2.urlopen`` are not caught here.
        """
        sc_url = "http://movie.mtime.com/" + id + "/"
        sc_req = urllib2.Request(sc_url, headers={'User-Agent': "Magic Browser"})
        sc_page = urllib2.urlopen(sc_req)
        try:
            sc_strw = sc_page.read()
        finally:
            # Release the connection even if the read fails.
            sc_page.close()

        # The capture group yields just the number, so no second regex pass is
        # needed; the dot is escaped so it only matches a decimal point.
        scores = re.findall(
            r'<span class="db_point ml6">(\d+\.\d+)</span>', sc_strw)
        if not scores:
            return 0.0
        return sum(float(score) for score in scores) / len(scores)
    
    # Download the Wuhu theater listing page and print the id -> title mapping
    # found in its embedded "hotplaySvList = [...]" JavaScript array.
    # The intermediate values are printed as a trace of each parsing step.
    # Single-argument print(...) behaves the same as the print statement on
    # Python 2.
    url = 'http://theater.mtime.com/China_Anhui_Province_Wuhu/'
    req = urllib2.Request(url, headers={'User-Agent': "Magic Browser"})
    webpage = urllib2.urlopen(req)
    try:
        # Keep the body as read; multiplying it by 0 would leave an empty
        # string and the start tag below could never be found.
        strw = webpage.read()
    finally:
        webpage.close()
    print(strw)

    list_marker = 'hotplaySvList = ['
    tg_start = strw.find(list_marker)
    print(tg_start)
    if tg_start == -1:
        print('not find start tag')
        # SystemExit lets the interpreter flush stdout (os._exit does not) and
        # the non-zero status reports the failure to the caller.
        raise SystemExit(1)
    tmp = strw[tg_start:]
    print(tmp)
    # The first ';' after the marker is taken as the end of the assignment.
    tg_end = tmp.find(';')
    print(tg_end)
    if tg_end == -1:
        print('not find end tag')
        raise SystemExit(1)
    tmp = tmp[len(list_marker):tg_end]
    print(tmp)

    # Each "},{"-separated piece is one entry: the id is the value of its
    # first field and the title is the last quoted string of its last field.
    tar_ls = tmp.split("},{")
    dict_film = {}
    for t0 in tar_ls:
        ls_t = t0.split(',')
        title_parts = ls_t[-1].split('"')
        if len(title_parts) < 2:
            # No quoted title (e.g. an empty list "[]"): skip the piece
            # instead of failing with an IndexError.
            continue
        film_id = ls_t[0].split(':')[-1].strip()
        dict_film[film_id] = title_parts[-2].strip()
    for t in dict_film:
        print("id:" + t + "  film:" + dict_film[t])
    
  • 相关阅读:
    SQL GUID和自增列做主键的优缺点
    php 一维数组去重
    php + crontab 执行定时任务
    PHP内置函数生成随机数的方法汇总
    PHP替换回车换行的三种方法
    Yii2查询之where条件拼装
    yii2 使用阿里大鱼短信
    javascript对数据处理
    Vue 404页面处理
    vue 中view层中方法的使用
  • 原文地址:https://www.cnblogs.com/doublekai/p/6857778.html
Copyright © 2011-2022 走看看