zoukankan      html  css  js  c++  java
  • 爬虫平台设置代理ip

    首先从国外某个网站爬取免费的代理 IP 信息并存入 MongoDB;接着在代码中设置代理:

    在爬虫客户端抽象类中添加属性 enableProxy(protected boolean enableProxy;),用于控制是否启用代理:

    设置代理的代码其实就以下几句:

    // Switch Firefox to manual proxy configuration (network.proxy.type = 1).
    firefoxProfile.setPreference("network.proxy.type", 1);
    // Hosts that must bypass the proxy and be reached directly.
    firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1"); 

    // Proxy host and port used for plain HTTP traffic.
    firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
    // NOTE(review): Firefox stores the *_port preferences as integers - confirm getPort() returns an int.
    firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());

    // Proxy host and port used for HTTPS traffic (Firefox calls this the "ssl" proxy).
    firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
    firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());

    以下是具体实现代码:

    /**
    * 爬虫客户端抽象类
    * 其生命周期如下
    * setSpiderDao→setRootUrl→setParamsMap→init→runSpider→returnData→destory
    */
    public abstract class SpiderClient {

    private static final Logger logger = LoggerFactory.getLogger(SpiderClient.class);
    protected SpiderDao spiderDao;
    protected SpiderData spiderData;
    protected WebDriver driver;
    protected String rootUrl;
    protected Map<String, Object> params;
    private String collection;
    protected boolean enableProxy;

    //.. get set

    /**
    * 初始化工作
    */
    public void init(){

    FirefoxProfile firefoxProfile = new FirefoxProfile(); 

    // 去掉css
    firefoxProfile.setPreference("permissions.default.stylesheet", 2);
    // 去掉图片
    firefoxProfile.setPreference("permissions.default.image", 2);
    // 去掉flash
    firefoxProfile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
    //设置默认下载
    // 设置是否显示下载进度框
    firefoxProfile.setPreference("browser.download.manager.showWhenStarting", false);
    // browser.download.folderList 设置Firefox的默认 下载 文件夹。0是桌面;1是“我的下载”;2是自定义
    firefoxProfile.setPreference("browser.download.folderList", 2);
    // ,如果使用自定义路径,必须要将browser.download.folderList设置为2
    firefoxProfile.setPreference("browser.download.dir", System.getProperty("java.io.tmpdir")+"material_images");
    // 设置哪种类型的文件下载不询问直接下载
    firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk","image/gif,image/png,image/jpeg,image/bmp,image/webp");
    /*firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk",
    "application/zip,text/plain,application/vnd.ms-excel,text/csv,text/comma-separated-values,application/octet-stream,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    */
    //proxy
    if(enableProxy){
    firefoxProfile.setPreference("network.proxy.type", 1);
    firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1");

    ProxyIP proxyHttp = getProxyIPForHttp();
    if(proxyHttp!=null){
    firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
    firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());
    logger.info("Set http proxy: {}:{}",proxyHttp.getIp(),proxyHttp.getPort());
    }
    ProxyIP proxyHttps = getProxyIPForHttps();
    if(proxyHttps!=null){
    firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
    firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());
    logger.info("Set https proxy: {}:{}",proxyHttps.getIp(),proxyHttps.getPort());
    }
    }
    this.driver = new FirefoxDriver(firefoxProfile);
    this.driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS);
    this.spiderData = new SpiderData();
    this.spiderData.setIds(new ArrayList<String>());

    }

    //先从China的ip获取(信号相对好,网速快)

    /**
     * Picks one HTTP proxy at random from the candidates stored in MongoDB.
     *
     * <p>The lookup asks the DAO for 20 "HTTP" entries filtered by "China"
     * (presumably the proxy's country - confirm against {@code MongoSpiderDao.getProxyIP}).
     *
     * @return a randomly chosen proxy, or {@code null} when {@code spiderDao} is not a
     *         {@code MongoSpiderDao} or the query yields no candidates
     */
    private ProxyIP getProxyIPForHttp(){
        // Proxy records are only reachable through the Mongo-backed DAO; guard the downcast so a
        // different (or missing) DAO disables proxying instead of throwing ClassCastException.
        if (!(spiderDao instanceof MongoSpiderDao)) {
            logger.warn("Cannot look up http proxy: spiderDao is not a MongoSpiderDao");
            return null;
        }
        List<ProxyIP> candidates = ((MongoSpiderDao) spiderDao).getProxyIP("HTTP", "China", 20);
        if (candidates == null || candidates.isEmpty()) {
            return null;
        }
        // nextInt's upper bound is exclusive, so the index is always within the list.
        return candidates.get(RandomUtils.nextInt(0, candidates.size()));
    }
    /**
     * Picks one HTTPS proxy at random from the candidates stored in MongoDB.
     *
     * <p>The lookup asks the DAO for 20 "HTTPS" entries filtered by "China"
     * (presumably the proxy's country - confirm against {@code MongoSpiderDao.getProxyIP}).
     *
     * @return a randomly chosen proxy, or {@code null} when {@code spiderDao} is not a
     *         {@code MongoSpiderDao} or the query yields no candidates
     */
    private ProxyIP getProxyIPForHttps(){
        // Proxy records are only reachable through the Mongo-backed DAO; guard the downcast so a
        // different (or missing) DAO disables proxying instead of throwing ClassCastException.
        if (!(spiderDao instanceof MongoSpiderDao)) {
            logger.warn("Cannot look up https proxy: spiderDao is not a MongoSpiderDao");
            return null;
        }
        List<ProxyIP> candidates = ((MongoSpiderDao) spiderDao).getProxyIP("HTTPS", "China", 20);
        if (candidates == null || candidates.isEmpty()) {
            return null;
        }
        // nextInt's upper bound is exclusive, so the index is always within the list.
        return candidates.get(RandomUtils.nextInt(0, candidates.size()));
    }

    ...

    }

    另外,有一个很好的自动获取有效免费代理 IP 的项目:https://github.com/yzf233/IPProxyTool ,只需要运行一条命令即可。

  • 相关阅读:
    85. Maximal Rectangle
    120. Triangle
    72. Edit Distance
    39. Combination Sum
    44. Wildcard Matching
    138. Copy List with Random Pointer
    91. Decode Ways
    142. Linked List Cycle II
    异或的性质及应用
    64. Minimum Path Sum
  • 原文地址:https://www.cnblogs.com/yzf666/p/7150617.html
Copyright © 2011-2022 走看看