  • python - web crawler

    No1:

    # -*- coding:utf-8 -*-
    
    import urllib.request
    
    # Build a Request with a custom User-Agent (placeholder value here), send it and read the body
    ua_headers = {"User-Agent": "..."}
    request = urllib.request.Request("http://www.baidu.com", headers=ua_headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    print(html)
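
    Besides read(), the response object returned by urlopen() also exposes the status
    code, the final URL and the headers. A minimal sketch (the User-Agent value is a
    placeholder, as above):

    import urllib.request

    request = urllib.request.Request("http://www.baidu.com", headers={"User-Agent": "..."})
    response = urllib.request.urlopen(request)
    print(response.getcode())  # HTTP status code, e.g. 200
    print(response.geturl())   # final URL after any redirect
    print(response.info())     # response headers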

    No2:

    # -*- coding:utf-8 -*-
    import urllib.request
    from urllib import parse
    
    
    def loadPage(url, filename):
        print("正在下载" + filename)
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"}
        request = urllib.request.Request(url, headers=headers)
        return urllib.request.urlopen(request).read()
    
    
    def writePage(html, filename):
        print("正在保存" + filename)
        with open(filename, "wb+") as f:
            f.write(html)
        print("-" * 30)
    
    
    def tiebaSpider(url, beginPage, endPage):
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            filename = "" + str(page) + "页.html"
            fullurl = url + "&pn=" + str(pn)
            html = loadPage(fullurl, filename)
            writePage(html, filename)
        print("谢谢使用")
    
    
    if __name__ == '__main__':
        kw = input("请输入需要爬取的贴吧名:")
        beginPage = int(input("请输入起始页:"))
        endPage = int(input("请输入结束页:"))
    
        url = "http://tieba.baidu.com/f?"
        key = parse.urlencode({"kw": kw})
        fullurl = url + key
        tiebaSpider(fullurl, beginPage, endPage)

    No3:

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    import urllib.request
    from urllib import parse
    
    # URL obtained by packet capture; it is not the URL shown in the browser address bar
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null"
    
    # Full set of request headers
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "X-Requested-With": "XMLHttpRequest",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }
    
    # Text to translate, entered by the user
    key = input("请输入需要翻译的文字:")
    
    # Form data sent to the web server
    formdata = {
        "type": "AUTO",
        "i": key,
        "doctype": "json",
        "xmlVersion": "1.8",
        "keyfrom": "fanyi.web",
        "ue": "UTF-8",
        "action": "FY_BY_CLICKBUTTON",
        "typoResult": "true"
    }
    
    # URL-encode the form data and convert it to bytes
    data = parse.urlencode(formdata).encode(encoding='UTF8')
    
    # If the data argument of Request() is set, the request is sent as a POST;
    # otherwise it is a GET
    request = urllib.request.Request(url, data=data, headers=headers)
    
    print(str(urllib.request.urlopen(request).read(), 'utf-8'))
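
    The GET/POST rule above can be checked with Request.get_method(), which reports the
    verb urllib will use. A small sketch (httpbin.org is only a stand-in test URL):

    import urllib.request
    from urllib import parse

    get_req = urllib.request.Request("http://httpbin.org/get")
    post_req = urllib.request.Request("http://httpbin.org/post",
                                      data=parse.urlencode({"k": "v"}).encode("utf-8"))
    print(get_req.get_method())   # GET
    print(post_req.get_method())  # POST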

    No4:

    AJAX request (Douban movie chart)

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    import urllib.request
    from urllib import parse
    
    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    
    # Paging parameters sent with the AJAX call
    formdata = {
        "start": "0",
        "limit": "20"
    }
    
    data = parse.urlencode(formdata).encode(encoding='utf-8')
    
    request = urllib.request.Request(url, data=data, headers=headers)
    
    print(str(urllib.request.urlopen(request).read(), 'utf-8'))
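
    The endpoint answers with JSON, so the body can be decoded instead of printed raw.
    A small self-contained sketch (the "title" and "score" fields are assumptions based
    on the observed response, not documented API fields):

    import json
    import urllib.request
    from urllib import parse

    url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action"
    headers = {"User-Agent": "Mozilla/5.0"}
    data = parse.urlencode({"start": "0", "limit": "20"}).encode("utf-8")
    req = urllib.request.Request(url, data=data, headers=headers)
    movies = json.loads(urllib.request.urlopen(req).read())
    for movie in movies:
        # field names assumed; print(movie.keys()) shows what is actually returned
        print(movie.get("title"), movie.get("score"))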

    No5:

    handler

    import urllib.request
    
    # debuglevel=1 makes the handler print the HTTP request/response exchange, useful for debugging
    http_handler = urllib.request.HTTPHandler(debuglevel=1)
    opener = urllib.request.build_opener(http_handler)
    request = urllib.request.Request("http://www.baidu.com/")
    response = opener.open(request)
    print(str(response.read(), 'utf-8'))

    No6:

    proxy

    import urllib.request
    
    # Toggle between a proxied opener and a direct (no-proxy) opener
    proxyswitch = True
    # Proxy address; the usual form is "http://ip:port"
    httpproxy_handler = urllib.request.ProxyHandler({"http": "222.22.66.211"})
    nullproxy_handler = urllib.request.ProxyHandler({})
    
    if proxyswitch:
        opener = urllib.request.build_opener(httpproxy_handler)
    else:
        opener = urllib.request.build_opener(nullproxy_handler)
    
    urllib.request.install_opener(opener)
    request = urllib.request.Request("http://www.baidu.com/")
    response = urllib.request.urlopen(request)
    print(str(response.read(), 'utf-8'))
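
    For a private proxy that requires a login, the credentials can be embedded in the
    proxy URL. A minimal sketch (user, password, host and port below are placeholders):

    import urllib.request

    # placeholder credentials and proxy address, not real values
    authproxy_handler = urllib.request.ProxyHandler(
        {"http": "http://user:password@proxy.example.com:8080"})
    opener = urllib.request.build_opener(authproxy_handler)
    response = opener.open("http://www.baidu.com/")
    print(response.getcode())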

    No7:

    HTTP basic authentication

    import urllib.request
    
    test = "test"
    password = "123456"
    webserver = "192.168.21.52"
    passwordMgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    passwordMgr.add_password(None, webserver, test, password)
    httpauth_handler = urllib.request.HTTPBasicAuthHandler(passwordMgr)
    opener = urllib.request.build_opener(httpauth_handler)
    request = urllib.request.Request("http://" + webserver)
    response = opener.open(request)
    print(str(response.read(), 'utf-8'))

    No8:

    cookie

    from urllib import request
    from urllib import parse
    from http import cookiejar
    
    cookie = cookiejar.CookieJar()
    cookie_handler = request.HTTPCookieProcessor(cookie)
    opener = request.build_opener(cookie_handler)
    opener.addheaders = [("User-Agent", "xxx")]
    url = "http://www.renren.com/PLogin.do"
    data = {"email": "xxx@163.com", "password": "xxx"}
    data = parse.urlencode(data).encode(encoding='UTF-8')
    req = request.Request(url, data=data)
    response = opener.open(req)
    print(str(response.read(), 'utf-8'))
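
    To keep the login cookies between runs, http.cookiejar also provides MozillaCookieJar,
    which can save the jar to a text file and load it back later. A minimal sketch
    ("cookie.txt" is just an arbitrary file name):

    from urllib import request
    from http import cookiejar

    cookie = cookiejar.MozillaCookieJar("cookie.txt")
    opener = request.build_opener(request.HTTPCookieProcessor(cookie))
    opener.open("http://www.baidu.com/")
    # ignore_discard keeps session cookies, ignore_expires keeps already-expired ones
    cookie.save(ignore_discard=True, ignore_expires=True)

    # later, in another run: load the saved cookies into a fresh jar
    cookie2 = cookiejar.MozillaCookieJar()
    cookie2.load("cookie.txt", ignore_discard=True, ignore_expires=True)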

    No9:

    Scraping jokes (内涵段子)

    from urllib import request
    import re
    
    
    class Spider:
        def __init__(self):
            self.page = 1
            self.switch = True
    
        def loadPage(self):
            print("正在下载数据...")
            url = "http://xiaohua.zol.com.cn/new/" + str(self.page) + ".html"
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
            req = request.Request(url, headers=headers)
            response = request.urlopen(req)
            html = str(response.read(), "gbk")
            pattern = re.compile(r'<div\s+class="summary-text">(.*?)</div>', re.S)
            content_list = pattern.findall(html)
            self.dealPage(content_list)
    
        def dealPage(self, content_list):
            for item in content_list:
                item = item.replace("<p>", "").replace("</p>", "").replace("<br>", "").replace('<p class="bbsp">',
                                                                                               "").replace("&nbsp", "")
                self.writePage(item)
    
        def writePage(self, item):
            print("正在写入数据...")
            with open("duanzi.txt", "a") as f:
                f.write(item)
    
        def startWork(self):
            while self.switch:
                self.loadPage()
                command = input("如果继续爬取,请按回车(退出输入quit)")
                if command == "quit":
                    self.switch = False
                self.page += 1
            print("谢谢使用!")
    
    
    if __name__ == "__main__":
        duanzi = Spider()
        duanzi.startWork()

    No10:

    Scraping images from Baidu Tieba

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    from urllib import request
    from urllib import parse
    from lxml import etree
    
    
    def loadPage(url):
        print("正在下载...")
        req = request.Request(url)
        html = request.urlopen(req).read()
        # Parse the HTML document into an element tree
        content = etree.HTML(html)
        # Collect the relative links of all matching threads
        link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
    
        # link_list = content.xpath('//a[@class="j_th_tit"]/@href')
        # Build the full URL of each thread and fetch its images
        for link in link_list:
            fulllink = "http://tieba.baidu.com" + link
            print("link=" + link)
            loadImage(fulllink)
    
    
    # Extract every image link from a single thread
    def loadImage(link):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        req = request.Request(link, headers=headers)
        html = request.urlopen(req).read()
        # Parse the thread page
        content = etree.HTML(html)
        # Collect the image links posted in the thread
        link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        # link_list = content.xpath('//div[@class="post_bubble_middle"]')
        # link_list = content.xpath('//img[@class="BDE_Image"]/@src')
        # Download each image
        for link in link_list:
            print("imglink" + link)
            writeImage(link)
    
    
    def writeImage(link):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        # Request the image itself
        req = request.Request(link, headers=headers)
        # Raw image bytes
        image = request.urlopen(req).read()
        # Use the last 10 characters of the link as the file name
        filename = link[-10:]
        # Write the bytes to a local file
        with open(filename, "wb") as f:
            f.write(image)
            print("已经成功下载 " + filename)
    
    
    def tiebaSpider(url, beginPage, endPage):
        for page in range(beginPage, endPage + 1):
            pn = (page - 1) * 50
            # filename = "第" + str(page) + "页.html"
            fullurl = url + "&pn=" + str(pn)
            # print(fullurl)
            loadPage(fullurl)

        print("谢谢使用")
    
    
    if __name__ == "__main__":
        kw = input("请输入需要爬取的贴吧名:")
        beginPage = int(input("请输入起始页:"))
        endPage = int(input("请输入结束页:"))
    
        url = "http://tieba.baidu.com/f?"
        key = parse.urlencode({"kw": kw})
        fullurl = url + key
        tiebaSpider(fullurl, beginPage, endPage)

    No11:

    Scraping Baidu image-search results

    # -*- coding: utf-8 -*-
    """根据搜索词下载百度图片"""
    import re
    import sys
    import urllib
    
    import requests
    
    
    def get_onepage_urls(onepageurl):
        """获取单个翻页的所有图片的urls+当前翻页的下一翻页的url"""
        if not onepageurl:
            print('已到最后一页, 结束')
            return [], ''
        try:
            html = requests.get(onepageurl)
            html.encoding = 'utf-8'
            html = html.text
        except Exception as e:
            print(e)
            pic_urls = []
            fanye_url = ''
            return pic_urls, fanye_url
        pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
        fanye_urls = re.findall(re.compile(r'<a href="(.*)" class="n">下一页</a>'), html, flags=0)
        fanye_url = 'http://image.baidu.com' + fanye_urls[0] if fanye_urls else ''
        return pic_urls, fanye_url
    
    
    def down_pic(pic_urls):
        """给出图片链接列表, 下载所有图片"""
        for i, pic_url in enumerate(pic_urls):
            try:
                pic = requests.get(pic_url, timeout=15)
                string = str(i + 1) + '.jpg'
                with open(string, 'wb') as f:
                    f.write(pic.content)
                    print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
            except Exception as e:
                print('下载第%s张图片时失败: %s' % (str(i + 1), str(pic_url)))
                print(e)
                continue
    
    
    if __name__ == '__main__':
        keyword = '中国美女'  # search keyword; change it to whatever you would search for on Baidu Images
        url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
        url_init = url_init_first + urllib.parse.quote(keyword, safe='/')
        all_pic_urls = []
        onepage_urls, fanye_url = get_onepage_urls(url_init)
        all_pic_urls.extend(onepage_urls)
    
        fanye_count = 0  # number of result pages visited so far
        while 1:
            onepage_urls, fanye_url = get_onepage_urls(fanye_url)
            fanye_count += 1
            # print('第%s页' % fanye_count)
            if fanye_url == '' and onepage_urls == []:
                break
            all_pic_urls.extend(onepage_urls)
    
        down_pic(list(set(all_pic_urls)))

    No12:

    Zhihu login

    from bs4 import BeautifulSoup
    import requests
    import time
    
    
    def captcha(captcha_data):
        with open("captcha.jpg", "wb") as f:
            f.write(captcha_data)
        text = input("请输入验证码:")
        return text
    
    
    def zhihuLogin():
        sess = requests.Session()
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
        html = sess.get("https://www.zhihu.com/#signin", headers=headers).text
        bs = BeautifulSoup(html, "lxml")
        _xsrf = bs.find("input", attrs={"name": "_xsrf"}).get("value")
    
        captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
        captcha_data = sess.get(captcha_url, headers=headers).content
        text = captcha(captcha_data)
        data = {
            "_xsrf": _xsrf,
            "email": "",
            "password": "",
            "captcha": text
        }
        response = sess.post("https://www.zhihu.com/login/email", data=data, headers=headers)
        print(response.text)
    
    
    if __name__ == "__main__":
        zhihuLogin()

    No13:

    JSON parsing

    import urllib.request
    import json
    import jsonpath
    
    url = "http://www.lagou.com/lbs/getAllCitySearchLabels.json"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read()
    unicodestr = json.loads(html)
    city_list = jsonpath.jsonpath(unicodestr, "$..name")
    for item in city_list:
        print(item)
    
    array = json.dumps(city_list, ensure_ascii=False)
    
    with open("lagoucity.json", "wb+") as f:
        f.write(array.encode("utf-8"))
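
    A jsonpath expression plays a role similar to XPath, but for JSON: "$..name" above
    means "every name field at any depth". A small self-contained sketch (the dict below
    is made-up sample data):

    import jsonpath

    data = {"provinces": [
        {"name": "广东", "cities": [{"name": "深圳"}, {"name": "广州"}]},
        {"name": "浙江", "cities": [{"name": "杭州"}]},
    ]}
    # "$..name" matches every "name" key, however deeply nested;
    # jsonpath() returns a list of matches, or False when nothing matches
    print(jsonpath.jsonpath(data, "$..name"))
    # only the top-level province names
    print(jsonpath.jsonpath(data, "$.provinces[*].name"))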

    No14:

    XML/HTML parsing (XPath)

    # -*- coding:utf-8 -*-
    
    import urllib.request
    from lxml import etree
    import json
    
    url = "http://www.qiushibaike.com/8hr/page/1/"
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    request = urllib.request.Request(url, headers=headers)
    html = urllib.request.urlopen(request).read()
    text = etree.HTML(html)
    
    node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')
    items = {}
    for node in node_list:
        username = node.xpath('./div/a/@title')[0]
        image = node.xpath('.//div[@class="thumb"]//@src')
        content = node.xpath('.//div[@class="content"]/span')[0].text
        zan = node.xpath('.//i')[0].text
        comments = node.xpath('.//i')[1].text
    
        items = {
            "username": username,
            "image": image,
            "content": content,
            "zan": zan,
            "comments": comments
        }
    
        with open("qiushi.json", "ab+") as f:
            f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + b"
    ")

    No15:

    Multithreading

    # -*- coding:utf-8 -*-
    
    import threading
    from queue import Queue
    from lxml import etree
    import requests
    import json
    
    
    class ThreadCrawl(threading.Thread):
        def __init__(self, threadName, pageQueue, dataQueue):
            super(ThreadCrawl, self).__init__()
            self.threadName = threadName
            self.pageQueue = pageQueue
            self.dataQueue = dataQueue
            self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}
    
        def run(self):
            print("启动" + self.threadName)
            while not CRAWL_EXIT:
                try:
                    page = self.pageQueue.get(False)
                    url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                    content = requests.get(url, headers=self.headers)
                    # put the decoded page text, not the Response object, on the data queue
                    self.dataQueue.put(content.text)
                except:
                    pass
            print("结束" + self.threadName)
    
    
    class ThreadParse(threading.Thread):
        def __init__(self, threadName, dataQueue, filename):
            super(ThreadParse, self).__init__()
            self.threadName = threadName
            self.dataQueue = dataQueue
            self.filename = filename
    
        def run(self):
            while not PARSE_EXIT:
                try:
                    html = self.dataQueue.get(False)
                    self.parse(html)
                except:
                    pass
    
        def parse(self, html):
            text = etree.HTML(html)

            node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')
            items = {}
            for node in node_list:
                username = node.xpath('./div/a/@title')[0]
                image = node.xpath('.//div[@class="thumb"]//@src')
                content = node.xpath('.//div[@class="content"]/span')[0].text
                zan = node.xpath('.//i')[0].text
                comments = node.xpath('.//i')[1].text
    
                items = {
                    "username": username,
                    "image": image,
                    "content": content,
                    "zan": zan,
                    "comments": comments
                }
    
                with open("qiushi.json", "ab+") as f:
                    f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + b"
    ")
    
    CRAWL_EXIT = False
    PARSE_EXIT = False
    
    
    def main():
        pageQueue = Queue(10)
        for i in range(1, 11):
            pageQueue.put(i)
    
        dataQueue = Queue()
    
        # file handle handed to the parse threads (they currently write to qiushi.json themselves)
        filename = open("duanzi.json", "a")
    
        crawList = ["采集线程1号", "采集线程2号", "采集线程3号"]
        threadcrawl = []
        for threadName in crawList:
            thread = ThreadCrawl(threadName, pageQueue, dataQueue)
            thread.start()
            threadcrawl.append(thread)
    
        parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
        threadparse = []
        for threadName in parseList:
            thread = ThreadParse(threadName, dataQueue, filename)
            thread.start()
            threadparse.append(thread)
    
        # Busy-wait until every page number has been taken, then stop the crawl threads
        while not pageQueue.empty():
            pass

        global CRAWL_EXIT
        CRAWL_EXIT = True

        for thread in threadcrawl:
            thread.join()

        # Drain the data queue, then stop the parse threads as well
        while not dataQueue.empty():
            pass

        global PARSE_EXIT
        PARSE_EXIT = True

        for thread in threadparse:
            thread.join()
    
    
    if __name__ == "__main__":
        main()
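
    The busy-wait loops and the CRAWL_EXIT/PARSE_EXIT flags can also be replaced by the
    queue's own bookkeeping: task_done() and join(). A minimal sketch of that pattern
    (worker() just prints the page number instead of crawling):

    import threading
    from queue import Queue

    q = Queue()

    def worker():
        while True:
            item = q.get()        # blocks until an item is available
            if item is None:      # sentinel: time to exit
                q.task_done()
                break
            print("handling page", item)
            q.task_done()         # mark this item as processed

    threads = [threading.Thread(target=worker) for _ in range(3)]
    for t in threads:
        t.start()
    for page in range(1, 11):
        q.put(page)
    q.join()                      # blocks until every item put has been marked done
    for _ in threads:
        q.put(None)               # one sentinel per worker
    for t in threads:
        t.join()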

    No16:

    selenium / webdriver

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    
    driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    driver.get("http://www.baidu.com/")
    
    driver.find_element_by_id("kw").send_keys(u"中国美女")
    # driver.find_element_by_id("su").click()
    driver.find_element_by_id("su").send_keys(Keys.ENTER)
    driver.save_screenshot("girl.png")
    driver.get_cookies()
    
    print(driver.page_source)

    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys

    driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    
    driver.get("https://www.douban.com/")
    driver.find_element_by_name("form_email").send_keys("xxx@example.com")
    driver.find_element_by_name("form_password").send_keys("xxx")
    driver.find_element_by_id("captcha_field").send_keys("short")
    driver.find_element_by_class_name("bn-submit").click()
    driver.save_screenshot("douban.png")

    No17:

    unittest-based test

    from selenium import webdriver
    import unittest
    from bs4 import BeautifulSoup as bs
    
    
    class douyu(unittest.TestCase):
        def setUp(self):
            self.driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    
        def testDouyu(self):
            self.driver.get("https://www.douyu.com/directory/all")
            while True:
                soup = bs(self.driver.page_source, "lxml")
                names = soup.find_all("h3", {"class": "ellipsis"})
                numbers = soup.find_all("span", {"class": "dy-num fr"})
                for name, number in zip(names, numbers):
                    print("观众人数:" + number.get_text().strip() + "\t房间名:" + name.get_text().strip())
    
                if self.driver.page_source.find("shark-pager-disable-next") != -1:
                    break
                self.driver.find_element_by_class_name("shark-pager-next").click()
    
        def tearDown(self):
            self.driver.quit()
    
    
    if __name__ == "__main__":
        unittest.main()

    No18:

    Executing JS

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    from selenium import webdriver
    import time
    
    driver = webdriver.PhantomJS(executable_path=r'D:\phantomjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    driver.get("https://movie.douban.com/typerank?type_name=剧情&type=11&interval_id=100:90&action=")
    
    time.sleep(30)
    # Scroll the page down by 10000 pixels
    js = "document.body.scrollTop=10000"
    # js = "var q=document.documentElement.scrollTop=10000"
    
    # Take a snapshot of the page before scrolling
    driver.save_screenshot("douban.png")
    
    # Execute the JS statement
    driver.execute_script(js)
    time.sleep(20)
    
    # Take another snapshot after scrolling
    driver.save_screenshot("newdouban.png")
    
    driver.quit()

    No19:

    tesseract: recognizing text in images (captchas)
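
    No code was included for this item. A minimal sketch of the usual approach with
    pytesseract and Pillow (the tesseract binary must be installed separately, and
    "captcha.jpg" is an assumed local file, e.g. the one saved in the Zhihu example):

    import pytesseract
    from PIL import Image

    image = Image.open("captcha.jpg")
    image = image.convert("L")                 # convert to grayscale first
    # real captchas usually also need thresholding/denoising before OCR works well
    text = pytesseract.image_to_string(image)  # pass lang="chi_sim" for Chinese text
    print(text.strip())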

  • Original article: https://www.cnblogs.com/anni-qianqian/p/10019672.html