zoukankan      html  css  js  c++  java
  • 使用 Python 和 Java 两种方式下载网页,并保存成文件。

    这里要注意的是:文件保存成 UTF-8 无 BOM 的格式。事实上我觉得带 BOM 的效果会更好、更加清晰,不过实现这一点的代码找起来比较麻烦,所以就没有做。

    package com.jd.cis;

    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.FileOutputStream;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.io.OutputStreamWriter;
    import java.io.Writer;
    import java.net.HttpURLConnection;
    import java.net.MalformedURLException;
    import java.net.URL;

    import org.apache.http.impl.io.ChunkedInputStream;

    /**
     * Downloads the Baidu home page, echoes it to standard output and saves it
     * as a UTF-8 file without a byte-order mark.
     */
    public class GzipFetchHtmlTest {
        /**
         * Fetches {@code http://www.baidu.com} with a browser-like user agent,
         * prints every line of the response, then writes the same lines to
         * {@code c:/baidu_url_fetch.html}.
         *
         * <p>Lines in the file are joined with the platform line separator and
         * no separator is written after the last line.
         *
         * @param args ignored
         * @throws IOException if connecting, downloading or writing the file fails
         */
        public static void main(String[] args) throws IOException
        {
            URL url = new URL("http://www.baidu.com");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            try {
                conn.setRequestMethod("GET");
                conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
                conn.connect();

                // The response is read exactly once into memory. Rewinding the
                // stream with mark()/reset() fails with an IOException as soon as
                // the page (or a single line) is longer than the read-ahead limit.
                String separator = System.getProperty("line.separator");
                StringBuilder page = new StringBuilder();

                // NOTE(review): the response is decoded with the platform default
                // charset, as before; confirm that this matches what the server sends.
                BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
                try {
                    System.out.println("======================");
                    boolean firstLine = true;
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);
                        // Separator goes between lines only, never after the last one.
                        if (!firstLine) {
                            page.append(separator);
                        }
                        page.append(line);
                        firstLine = false;
                    }
                } finally {
                    reader.close();
                }

                // The file is opened only after the download has fully succeeded,
                // so a failed download never leaves a truncated file behind.
                BufferedWriter bf = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("c:/baidu_url_fetch.html"), "UTF-8"));
                try {
                    bf.write(page.toString());
                    bf.flush();
                } finally {
                    bf.close();
                }
            } finally {
                conn.disconnect();
            }
        }

    }

    以下是 Python 代码:

    #!/usr/bin/env python
    #encoding=utf-8
    import sys
    reload(sys)
    sys.setdefaultencoding("utf-8")

    #url="http://www.yuncheng.com/searchcate.aspx?page=4&cate=%E5%9B%BE%E4%B9%A6&subcate=%E5%8E%9F%E5%88%9B%E6%96%87%E5%AD%A6&level3=%E7%8E%84%E5%B9%BB&level4=%E4%B8%9C%E6%96%B9%E7%8E%84%E5%B9%BB#hd "
    #url=url.strip()
    #import urllib2
    #for i in xrange(2,30):
    #    print i
    #    html=urllib2.urlopen(url).read()
    #    f=open("c:/22/%s.txt"%i,"w")
    #    f.write(html)
    #    f.close()
    import codecs
    import urllib2,types
    url="http://www.baidu.com"
    #f=open("c:/baidu_fetch_by_py.html","w")
    f=codecs.open("c:/baidu_fetch_by_py.html","w",encoding='UTF-8')
    html=urllib2.urlopen(url).read()
    if type(html)==types.UnicodeType:
        html=html.encode("utf-8")
        print "html covert"
    else:
        html=html.decode("gbk","ignore").encode("utf-8","ignore")
        print "convert html"
    f.write(html)
    f.close()

    两个生成的文件在md5sum下校验是一样的

    最后生成的文件的字节数也是一样的。

    都是使用了语言内置的函数,没有外部依赖

  • 相关阅读:
    函数
    文件的基本操作
    c语言程序设计案例教程(第2版)笔记(一)—零散、输入输出、最小公倍数、选择排序、冒泡排序
    c语言中的rand()函数用法
    c语言 error C4996: 'strupr': The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name
    Python之列表生成式、生成器
    Python之迭代器
    Python之装饰器
    Linux之线程相关命令及常用命令
    重写、重构、重载区别
  • 原文地址:https://www.cnblogs.com/lexus/p/2391706.html
Copyright © 2011-2022 走看看