zoukankan      html  css  js  c++  java
  • python 爬虫示例,方便日后参考

    参考网址:https://zhuanlan.zhihu.com/p/32037625

    def getOneMoviesInfo(Mid, url, outPath=r'C:\Users\yuqiao\Desktop\testSpider.txt'):
        """Scrape one movie page and append its fields as one '|'-delimited line.

        The page at ``url`` is downloaded, parsed with lxml and queried with
        fixed XPath expressions.  One record is appended to ``outPath`` with
        the fields, in order: Mid, name, brief, year, date, director,
        three writers, five stars, picture.  Any field that cannot be found
        on the page is written as the string ``NULL``.

        Args:
            Mid: Identifier of the movie; written verbatim as the first field.
            url: Address of the movie page to download.
            outPath: Text file the record is appended to (UTF-8).

        Returns:
            None.  When the page has no title node, a message is printed and
            nothing is written.

        Raises:
            requests.exceptions.RequestException: if the download fails or
                takes longer than the timeout.
        """
        import requests
        from lxml import etree

        def first(nodes, index=0, default='NULL'):
            """Return nodes[index], or default when the XPath matched too few nodes."""
            return nodes[index] if len(nodes) > index else default

        # A timeout keeps one stalled connection from hanging the whole crawl.
        data = requests.get(url, timeout=30).text   # download the website
        s = etree.HTML(data)                        # analyse data

        name = s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/a/h2/text()')
        if not name:
            # Without a title there is nothing worth recording for this page.
            print("Mid = %s , failed for a lack of TMDB id " % (Mid,))
            return
        name = name[0]

        # Store the URL itself rather than the list returned by xpath(), so the
        # column has the same shape whether or not a poster was found.
        picture = first(s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[1]/div[1]/img/@src'))

        # The year is rendered as "(YYYY)"; drop the parentheses and any
        # whitespace around them.
        year = first(s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/span/text()')).strip().strip("()").strip()
        # The original code reads the second text node of the first <li>.
        date = first(s.xpath('//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()'), index=1).strip()
        # Escape real newlines so that every record stays on a single line.
        brief = first(s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/div/p/text()')).replace("\n", "\\n")

        director = "NULL"
        writers = []
        # All main creators; each <li> is read as a name link plus a profession.
        for creator in s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/ol/li'):
            creatorName = creator.xpath('./p[1]/a/text()')
            creatorProfession = creator.xpath('./p[2]/text()')
            if not creatorName or not creatorProfession:
                # Skip an incomplete entry instead of discarding what was
                # already collected from the other entries.
                continue
            if 'Director' in creatorProfession[0]:
                director = creatorName[0]
            elif 'Screenplay' in creatorProfession[0] or 'Writer' in creatorProfession[0]:
                writers.append(creatorName[0])

        stars = []
        for cast in s.xpath('//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li'):
            star = cast.xpath('./p[1]/a/text()')
            if star:
                stars.append(star[0])

        # Pad to the fixed column counts (3 writers, 5 stars); a negative
        # multiplier yields an empty list, so longer lists are left untouched.
        writers += ["NULL"] * (3 - len(writers))
        stars += ["NULL"] * (5 - len(stars))

        fields = [Mid, name, brief, year, date, director]
        fields += writers[:3]
        fields += stars[:5]
        fields.append(picture)
        with open(outPath, 'a', encoding='utf-8') as f:
            f.write("|".join(str(field) for field in fields) + "\n")
        print(Mid)
        print(name)
        
    #______________________________________________________main script__________________________________________________________
    import time
    # Opening in 'w' mode truncates the output file, so each run starts empty
    # (presumably the same file getOneMoviesInfo appends to).
    with open(r'C:\Users\yuqiao\Desktop\testSpider.txt','w',encoding='utf-8'):
        pass
    language = '?language=zh-CN'  # query-string suffix; not used by the code visible here
    # NOTE(review): this path appears to have lost its backslashes during
    # extraction; the separator positions cannot be recovered from here, so the
    # literal is left untouched -- confirm the real location before running.
    with open(r'D:gitiyeMovieMidURL.txt', "rt",encoding='utf-8') as in_file:
        content = in_file.read()
        lines = content.split("\n")

        #for i in range(51,61):    51~60
        # Only the single line at index 9124 is processed (and just printed).
        for i in range(9124,9125):
            line = lines[i]
            print(line)

    print('finished')
    
    
    
  • 相关阅读:
    slf4j绑定log4j失败
    [转]activiti5用户任务分配
    关于ajax提交的公共接口的一大用处
    jQuery插件开发方式
    centos7安装mysql
    Centos7安装JDK
    奇葩问题:spring+mybaits项目突然出现其中一些Mapper类找不到
    JAVA多线程下,获取递增的序列号
    库存扣减的流水账记录问题
    My97DatePicker使用的问题
  • 原文地址:https://www.cnblogs.com/YuQiao0303/p/9277666.html
Copyright © 2011-2022 走看看