zoukankan      html  css  js  c++  java
  • python爬虫获取下一页

    from time import sleep
    
    import faker
    import requests
    from lxml import etree
    
    # Faker instance used to generate a fresh random User-Agent per request,
    # which makes the scraper look less like a single automated client.
    fake = faker.Faker()
    
    # Site root; relative "next page" hrefs are joined onto this.
    base_url = "http://angelimg.spbeen.com"
    
    def get_next_link(url):
        """Return the absolute URL of the "next page" link on *url*.

        Returns False when the page has no next link (last page).
        """
        tree = etree.HTML(downloadHtml(url))
        hrefs = tree.xpath("//a[@class='ch next']/@href")
        # hrefs is a (possibly empty) list of relative paths.
        return base_url + hrefs[0] if hrefs else False
    
    def downloadHtml(url):
        """GET *url* and return the response body as text.

        Bug fix: the parameter was named ``ur`` while the body read ``url``,
        so the function silently used the *global* ``url`` set in __main__
        and only worked by accident. Both call sites pass positionally, so
        renaming the parameter is backward compatible.
        """
        headers = {
            'User-Agent': fake.user_agent(),
            # The site appears to require a Referer; keep it on every request.
            "Referer": "http://angelimg.spbeen.com/",
        }
        response = requests.get(url, headers=headers)
        return response.text
    
    def getImgUrl(content):
        """Parse an article page and return ``(img_src, title)``.

        Either element is None when the page lacks it; saveImg() already
        checks for None, so this is safer than the original IndexError on
        an empty xpath result.
        """
        html = etree.HTML(content)
        img_url = html.xpath('//*[@id="content"]/a/img/@src')
        # Bug fix: the original xpath ".//div['@class=article']/h2/text()"
        # used a non-empty string as the predicate (always true), matching
        # <h2> under *every* div. Use a real attribute filter instead.
        title = html.xpath("//div[@class='article']/h2/text()")
        return (img_url[0] if img_url else None,
                title[0] if title else None)
    
    def saveImg(title, img_url):
        """Download *img_url* and save it as ``txt/<title>.jpg``.

        Skips silently when either argument is None (page had no image or
        no title). Fixes: the redundant ``f.close()`` inside the ``with``
        block is gone, the HTTP request now happens before the file is
        opened (no empty file left behind on a network error), and the
        target directory is created if missing.
        """
        if img_url is None or title is None:
            return
        import os
        os.makedirs("txt", exist_ok=True)
        headers = {
            'User-Agent': fake.user_agent(),
            "Referer": "http://angelimg.spbeen.com/",
        }
        response = requests.get(img_url, headers=headers)
        with open("txt/" + str(title) + ".jpg", 'wb') as f:
            f.write(response.content)
    
    def request_view(response):
        """Debug helper: dump *response* to tmp.html and open it in a browser.

        A ``<base href>`` tag is injected right after ``<head>`` so that
        relative links/images in the saved page resolve against the
        original URL instead of the local file system.
        """
        import webbrowser
        # Renamed from ``base_url`` to avoid shadowing the module-level
        # constant of the same name.
        base_tag = ('<head><base href="%s">' % response.url).encode()
        content = response.content.replace(b"<head>", base_tag)
        # ``with`` guarantees the file is closed even if the write fails.
        with open('tmp.html', 'wb') as tmp_html:
            tmp_html.write(content)
        webbrowser.open_new_tab('tmp.html')
    
    def crawl_img(url):
        """Fetch the page at *url*, extract its image, and save it to disk."""
        page = downloadHtml(url)
        img_url, title = getImgUrl(page)
        saveImg(title, img_url)
    
    if __name__ == "__main__":
        # First page of the gallery; each iteration saves the current
        # page's image, then follows the "next" link until there is none
        # (get_next_link returns False, which ends the loop).
        url = "http://angelimg.spbeen.com/ang/4968/1"
        while url:
            print(url)
            crawl_img(url)
            url = get_next_link(url)

    另一种方式是：先获取总页数，再按页码循环抓取每一页。

  • 相关阅读:
    bzoj 3438: 小M的作物
    bzoj 4445 [SCOI2015] 小凸想跑步
    hdu 4899 Hero meet devil
    hdu 4898 The Revenge of the Princess’ Knight
    【NOIP1999】拦截导弹
    【OpenJudge】2991:2011 题解
    【cqbzoj】1785:残缺棋盘上放车的方案数 --状压dp --输入毁一生
    【cqbzoj】:1330 Prime DP(Ahio2001 质数和分解)
    【Openjudge:Noi】7891:一元三次方程求解 c++
    【USACO FEB 2010 SILVER】吃巧克力(Chocolate Eating)
  • 原文地址:https://www.cnblogs.com/php-linux/p/12485691.html
Copyright © 2011-2022 走看看