zoukankan      html  css  js  c++  java
  • python爬虫获取下一页

    from time import sleep
    
    import faker
    import requests
    from lxml import etree
    
    fake = faker.Faker()
    
    base_url = "http://angelimg.spbeen.com"
    
    def get_next_link(url):
        """Fetch *url* and return the absolute URL of its "next page" link.

        Returns False when the page has no ``<a class="ch next">`` anchor.
        """
        page = etree.HTML(downloadHtml(url))
        candidates = page.xpath("//a[@class='ch next']/@href")
        if not candidates:
            return False
        return base_url + candidates[0]
    
    def downloadHtml(url):
        """Download *url* with a randomized User-Agent and return the body text.

        Bug fix: the parameter was named ``ur`` while the body read ``url``,
        so the argument was ignored and the function only worked by accident
        through the module-level ``url`` global. The parameter is now used.
        """
        user_agent = fake.user_agent()
        # Spoofed Referer is required by the target site's hotlink protection.
        headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
        response = requests.get(url, headers=headers)
        return response.text
    
    def getImgUrl(content):
        """Parse an article page and return ``(image_url, title)``.

        Raises IndexError when the page contains no matching image or title.
        """
        html = etree.HTML(content)
        img_url = html.xpath('//*[@id="content"]/a/img/@src')
        # Bug fix: the original predicate ".//div['@class=article']/h2/text()"
        # put the whole test inside quotes, making it a truthy string literal
        # that matched EVERY div; select div elements by class attribute instead.
        title = html.xpath("//div[@class='article']/h2/text()")
        return img_url[0], title[0]
    
    def saveImg(title, img_url):
        """Download *img_url* and save it as ``txt/<title>.jpg``.

        Does nothing when either argument is None. The HTTP request is made
        before the file is opened, so a failed download no longer leaves an
        empty file behind.
        """
        if img_url is None or title is None:
            return
        user_agent = fake.user_agent()
        headers = {'User-Agent': user_agent, "Referer": "http://angelimg.spbeen.com/"}
        response = requests.get(img_url, headers=headers)
        # ``with`` closes the file on exit; the original's explicit f.close()
        # inside the with-block was redundant and has been removed, along with
        # the commented-out request_view() debug call.
        with open("txt/" + str(title) + ".jpg", 'wb') as f:
            f.write(response.content)
    
    def request_view(response):
        """Debug helper: dump *response*'s body to tmp.html and open a browser tab.

        Injects a ``<base>`` tag pointing at the original request URL so that
        relative links and assets in the saved page still resolve.
        """
        import webbrowser
        base_tag = '<head><base href="%s">' % (response.url,)
        content = response.content.replace(b"<head>", base_tag.encode())
        # Context manager ensures the temp file is closed even if write() fails
        # (the original left the handle open on error).
        with open('tmp.html', 'wb') as tmp_html:
            tmp_html.write(content)
        webbrowser.open_new_tab('tmp.html')
    
    def crawl_img(url):
        """Download the page at *url* and save the article image it contains."""
        page = downloadHtml(url)
        img_url, title = getImgUrl(page)
        saveImg(title, img_url)
    
    if __name__ == "__main__":
        # Start from the first page of the gallery and keep following the
        # "next" link until get_next_link() reports there is none (False).
        page_url = "http://angelimg.spbeen.com/ang/4968/1"
        while page_url:
            print(page_url)
            crawl_img(page_url)
            page_url = get_next_link(page_url)

    还有种方式,获取到总页数,再循环 

  • 相关阅读:
    程序修炼之道——从小工到专家(3)
    组合
    子类重用父类的功能
    对象之间的交互
    属性查找与绑定方法
    类与对象的定义与使用
    hashlib模块subprocess模块
    configerparser模块
    shelve模块 xml模块
    sys模块 json pickle模块
  • 原文地址:https://www.cnblogs.com/php-linux/p/12485691.html
Copyright © 2011-2022 走看看