zoukankan      html  css  js  c++  java
  • 【python3两小时快速入门】入门笔记03:简单爬虫+多线程爬虫

    作用:直接将目标网页保存到本地

    1、爬虫代码修改自网络,目前运行平稳,博主需要的是精准爬取,数据量并不大,暂未加多线程。

    2、分割策略是通过查询条件进行分类,循环启动多条线程。

    1、单线程简单爬虫(第二次整理)

    import urllib.parse
    import urllib.request
    
    import os
    import datetime
    import json
    
    
    # Fetch page data; returns the whole page as text.
    def getHtml(url,values):
        """GET ``url`` with query parameters ``values``; return the page as a str.

        The response body is decoded as UTF-8.
        """
        user_agent='Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
        headers = {'User-Agent':user_agent}
        data = urllib.parse.urlencode(values)
        # Bug fix: the original built `headers` but never sent it (urlopen was
        # called with the bare URL). Wrap in a Request so the UA is attached.
        req = urllib.request.Request(url+'?'+data, headers=headers)
        response_result = urllib.request.urlopen(req).read()
        html = response_result.decode('utf-8')
        return html
    
    # Assemble the request parameters for one listing page and fetch it.
    def requestCnblogs(index):
        """Fetch page ``index`` of the target listing via getHtml."""
        print('请求数据')
        url = 'http://xxx解析链接xxx.com/'
        params = {
            'param1': '',
            'param2': '',
            'param3': '308',
            'page': index,
        }
        return getHtml(url, params)
    
    
    #print(requestCnblogs(1))
    
    
    
    # Write the page text out to a file.
    def writeToTxt(html,file_path):
        """Write ``html`` to ``file_path`` as UTF-8; print the path first.

        IOError is caught and reported, not propagated (best-effort save).
        """
        print(file_path)
        try:
            # 'with' guarantees the handle is closed even if write() raises —
            # the original leaked the handle on a failed write.
            with open(file_path,"w+",encoding='utf-8') as fp:
                fp.write(html)
        except IOError:
            print("fail to open file")
    
    
    
    # Create the output folder (if missing) and return its path.
    def createFile():
        """Ensure the output directory exists; return its (relative) path."""
        # date = datetime.datetime.now().strftime('%Y-%m-%d')
        # NOTE(review): r'P:Users' has no separator after the drive letter —
        # on Windows this resolves relative to P:'s current directory;
        # probably 'P:/Users' was intended. Confirm before deploying.
        path = r'P:Users' + '/foldername'
        # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is
        # race-free and also creates missing parents, which os.mkdir cannot.
        os.makedirs(path, exist_ok=True)
        return path
    
    
    # Save listing pages one by one, forever.
    def saveBlogs():
        """Fetch consecutive pages starting at 1 and save each to its own file.

        On IOError the same page is retried after a 10-minute pause. The loop
        never exits on its own; stop the process to stop crawling.
        """
        import time  # local import: this snippet's header does not import time
        i = 1
        while True:
            try:
                print('request for '+str(i)+'...')
                blogs = requestCnblogs(i)
                # Save to file.
                path = createFile()
                writeToTxt(blogs,path+'/filenames'+ str(i) +'.txt')
                print('第'+ str(i) +'页已经完成')
                i = i + 1
            except IOError:
                print("sleep 10min and retry")
                # Bug fix: the original printed this message but never slept,
                # busy-looping at full speed on a persistent error.
                time.sleep(600)
        return 'success'  # unreachable; kept for interface compatibility
    
    
    
    # Kick off the crawl (module-level side effect: runs immediately on import).
    result = saveBlogs()
    print(result)
    

    并发爬虫:https://www.cnblogs.com/huohuohuo1/p/9064759.html

    2、多线程爬虫(第二次整理)

    这里是利用安全的队列保证线程安全,首先要将地址放入队列(摘抄自网络)

    # coding=utf-8
    import threading, queue, time, urllib
    from urllib import request
    # Listing root; pages 2..9 live at baseUrl + '<n>.html'.
    baseUrl = 'http://www.pythontab.com/html/pythonjichu/'
    # Thread-safe queue of URLs; worker threads drain it concurrently.
    urlQueue = queue.Queue()
    # Seed all URLs before any worker starts, so workers only ever consume.
    for i in range(2, 10):
     url = baseUrl + str(i) + '.html'
     urlQueue.put(url)
     #print(url)
    def fetchUrl(urlQueue):
        """Worker loop: drain URLs from ``urlQueue`` until it is empty.

        For each URL, fetch it; on HTTP 200, decode the body as UTF-8 and
        print it (with a 1-second delay, kept from the original demo).
        Fetch failures are skipped so one bad URL cannot kill the worker.
        """
        while True:
            try:
                # Non-blocking read: an empty queue means all work is done.
                url = urlQueue.get_nowait()
            except queue.Empty:
                # Narrowed from bare Exception: only "queue empty" ends the loop.
                break
            print ('Current Thread Name %s, Url: %s ' % (threading.current_thread().name, url))
            try:
                response = urllib.request.urlopen(url)
                responseCode = response.getcode()
            except Exception:
                # Network/HTTP error: skip this URL, keep draining the queue.
                continue
            if responseCode == 200:
                html = response.read().decode('utf-8')
                time.sleep(1)
                print(html)
    if __name__ == '__main__':
     startTime = time.time()
     threads = []
     # Tune the thread count to control crawl speed.
     threadNum = 4
     for i in range(0, threadNum):
      t = threading.Thread(target=fetchUrl, args=(urlQueue,))
      threads.append(t)
     for t in threads:
      t.start()
     for t in threads:
      # Join each worker in turn so the main thread exits last; the joins do
      # not stop the workers from running concurrently with each other.
      t.join()
     endTime = time.time()
     print ('Done, Time cost: %s ' % (endTime - startTime))

    3、自己改进了下(未整理,但正在使用)

    # coding=utf-8
    import threading, queue, time, urllib
    import urllib.parse
    import urllib.request
    import os
    import datetime
    import json
    
    from urllib import request
    # Listing root; detail pages 1..880 live at baseUrl + '<n>/'.
    baseUrl = 'http://www.xxxxxxxxx.cn/xxx/402/'
    # Thread-safe queue of URLs shared by the worker threads.
    urlQueue = queue.Queue()
    
    
    def writeToTxt(html, file_path):
        """Write ``html`` to ``file_path`` as UTF-8 text; print the path first.

        Write the raw string as-is — re-serializing already-serialized JSON
        here would corrupt the output format.
        """
        print(file_path)
        try:
            # 'with' guarantees the handle is closed even if write() raises —
            # the original leaked the handle on a failed write.
            with open(file_path, "w+", encoding='utf-8') as fp:
                fp.write(html)
        except IOError:
            print("fail to open file")
    
    
    # Create the output folder (if missing) and return its path.
    def createFiles():
        """Ensure the output directory exists; return its (relative) path.

        Safe to call concurrently from multiple worker threads.
        """
        # date = datetime.datetime.now().strftime('%Y-%m-%d')
        # NOTE(review): r'P:Users3' lacks a separator after the drive letter —
        # probably 'P:/Users3' was intended. Confirm before deploying.
        path = r'P:Users3' + '/402'
        # exist_ok=True fixes the check-then-create race: this is called from
        # several fetchUrl threads at once, and the original exists()/mkdir()
        # pair could raise FileExistsError when two threads raced past the check.
        os.makedirs(path, exist_ok=True)
        return path
    
    # Seed detail pages 1..880 into the queue before the workers start.
    for i in range(1, 881):
     url = baseUrl + str(i) + "/"
     urlQueue.put(url)
     #print(url)
    def fetchUrl(urlQueue):
        """Worker loop: drain URLs from ``urlQueue`` and save each 200 response.

        Each page is written to '<outdir>/filename<n>.txt', where <n> is the
        page number taken from the URL itself ('<base>/<n>/').
        """
        while True:
            try:
                # Non-blocking read: an empty queue means all work is done.
                url = urlQueue.get_nowait()
            except queue.Empty:
                # Narrowed from bare Exception: only "queue empty" ends the loop.
                break
            print ('Current Thread Name %s, Url: %s ' % (threading.current_thread().name, url))
            try:
                response = urllib.request.urlopen(url)
                responseCode = response.getcode()
            except Exception:
                # Network/HTTP error: skip this URL, keep draining the queue.
                continue
            if responseCode == 200:
                html = response.read().decode('utf-8')
                # Bug fix: the original named files after urlQueue.qsize(),
                # which races between threads — two workers could observe the
                # same size and silently overwrite each other's files. The page
                # number embedded in the URL is unique per URL.
                pageNo = url.rstrip('/').rsplit('/', 1)[-1]
                path = createFiles()
                writeToTxt(html, path + '/filename' + pageNo + '.txt')
    
    
    
    if __name__ == '__main__':
     startTime = time.time()
     threads = []
     # Tune the thread count to control crawl speed.
     threadNum = 4
     for i in range(0, threadNum):
      t = threading.Thread(target=fetchUrl, args=(urlQueue,))
      threads.append(t)
     for t in threads:
      t.start()
     for t in threads:
      # Join each worker in turn so the main thread exits last; the joins do
      # not stop the workers from running concurrently with each other.
      t.join()
     endTime = time.time()
     print ('Done, Time cost: %s ' % (endTime - startTime))
    
    
    
    
     # NOTE(review): dead code — defined inside the __main__ guard and never
     # called. It references requestCnblogs, which is not defined anywhere in
     # this script, so calling it would raise NameError. Apparently left over
     # from the single-threaded snippet; confirm and delete.
     def saveBlogs():
      i = 51;  # 873
      while 1 == 1:

       try:
        print('request for ' + str(i) + '...')
        blogs = requestCnblogs(i)
        # Save to file.
        path = createFiles()
        writeToTxt(blogs, path + '/nongyeyinhang' + str(i) + '.txt')
        print('第' + str(i) + '页已经完成')
        i = i + 1;
       except IOError:
        print("sleep 10min and retry")
      return 'success'
    
    
    

    个人记录,处理下载的文件写入数据库(java代码):

    package com.zzt.spider;
    
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Scanner;
    
    /**
     * 读取爬取的数据
     * @author ZX
     *
     */
    public class ReadSpiderData3 {
        /**
         * Walk the download directory and parse every crawled file.
         */
        public static void main(String[] args) {
            // Bug fix: the original literal "P:\Users3\102" does not compile —
            // \U and \1 are illegal escape sequences; backslashes must be doubled.
            File fileDir = new File("P:\\Users3\\102");
            if (!fileDir.exists()) {
                return;
            }
            String[] list = fileDir.list();
            for (String str : list) { // every downloaded file name
                readTxt("P:\\Users3\\102\\" + str);
            }
            // Keeps the console open after processing (original behavior).
            Scanner sc = new Scanner(System.in);
        }

        /**
         * Parse one downloaded HTML file. Lines between the
         * "&lt;th&gt;SWIFT CODE&lt;/th&gt;" marker and the pager div hold the
         * table rows; rows 1..83 of that section are printed (and could be
         * inserted via insertBank).
         *
         * @param path absolute path of the file to read
         */
        public static void readTxt(String path) {
            // try-with-resources: the original only closed the reader on the
            // success path, leaking the handle when an IOException was thrown.
            try (BufferedReader br = new BufferedReader(new FileReader(new File(path)))) {
                String line;
                int inDataSection = -1; // 1 while inside the SWIFT CODE table
                int lineCount = -1;     // row counter within the section
                while ((line = br.readLine()) != null) {
                    if (line.contains("<th>SWIFT CODE</th>")) {
                        inDataSection = 1;
                    }
                    if (inDataSection == 1) {
                        lineCount++;
                        if (lineCount >= 1 && lineCount < 84) {
                            // Blank lines are counted but not emitted.
                            if ("".equals(line.trim())) {
                                continue;
                            }
                            System.out.println(line);
                            // TODO: parse the row, then:
                            // insertBank(code, name, phone, addr, "170");
                        }
                    }
                    // Bug fix: the original had unescaped inner quotes here
                    // ("<div class="page">"), which does not compile.
                    if (line.contains("<div class=\"page\">")) {
                        inDataSection = -1;
                    }
                }
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Insert one bank row into SP_BANK_DETILS_S2.
         */
        public static void insertBank(String BANK_CODE, String BANK_NAME, String BANK_PHONE, String BANK_ADDR, String BANK_NO) {
            Connection conn = createConn();
            String sql = "insert into SP_BANK_DETILS_S2 (BANK_CODE,BANK_NAME,BANK_PHONE,BANK_ADDR,BANK_NO) values(?,?,?,?,?)";
            PreparedStatement pstmt = null;
            try {
                pstmt = conn.prepareStatement(sql);
                pstmt.setString(1, BANK_CODE);
                pstmt.setString(2, BANK_NAME);
                pstmt.setString(3, BANK_PHONE);
                pstmt.setString(4, BANK_ADDR);
                pstmt.setString(5, BANK_NO);
                pstmt.executeUpdate();
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                // Leak fix: the original closed the statement/connection only
                // on the success path.
                closeConn(null, pstmt, conn);
            }
        }

        /**
         * Open a JDBC connection.
         *
         * @return the connection, or null if the driver or connect failed
         */
        private static Connection createConn() {
            Connection conn = null;
            try {
                Class.forName("com.mysql.jdbc.Driver");
                conn = DriverManager.getConnection("jdbc:mysql://192.168.0.100:3306/payrecdb?characterEncoding=utf8", "name", "pwd");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
            return conn;
        }

        /**
         * Close result set, statement and connection, ignoring nulls.
         *
         * @param rs   result set to close (may be null)
         * @param stmt statement to close (may be null)
         * @param conn connection to close (may be null)
         */
        private static void closeConn(ResultSet rs, PreparedStatement stmt, Connection conn) {
            try {
                if (rs != null)
                    rs.close();
                if (stmt != null)
                    stmt.close();
                if (conn != null)
                    conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }

    }
    
  • 相关阅读:
    Java数组(1):数组与多维数组
    Java内部类(5):应用例
    Java内部类(4):静态内部类&接口内部类
    Java内部类(3):局部内部类
    Java内部类(2):普通的成员内部类
    Java内部类(1):概述
    Java中验证编码格式的一种方法
    Mybatis高级结果映射
    Mybatis Guide
    Java泛型(11):潜在类型机制
  • 原文地址:https://www.cnblogs.com/the-fool/p/11054115.html
Copyright © 2011-2022 走看看