这里要注意的是,生成的文件保存为 UTF-8 无 BOM 的格式。事实上我觉得带 BOM 的效果会更好、更清晰,不过实现这一点的代码找起来比较麻烦,所以就没有做。
package com.jd.cis;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.http.impl.io.ChunkedInputStream;
/**
 * Downloads the Baidu home page with {@link HttpURLConnection}, echoes it to
 * standard output line by line and saves the same text to a local file
 * encoded as UTF-8 (no byte-order mark is written).
 */
public class GzipFetchHtmlTest {
    /**
     * Fetches the page, prints every line and writes the lines to
     * {@code c:/baidu_url_fetch.html}.
     *
     * <p>Lines in the output file are joined with the platform line separator
     * and no separator is written after the last line.
     *
     * @param args unused
     * @throws IOException if the connection, the read or the file write fails
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://www.baidu.com");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setRequestProperty("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)");
            conn.connect();

            // NOTE(review): the response is decoded with the platform default
            // charset, exactly as before; confirm this matches the charset the
            // server actually sends before relying on the saved bytes.
            BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            try {
                System.out.println("======================");
                BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream("c:/baidu_url_fetch.html"), "UTF-8"));
                try {
                    // Single pass over the body: the previous mark()/reset()
                    // replay failed with "Mark invalid" once the page (or one
                    // line) exceeded the fixed read-ahead limits.
                    boolean firstLine = true;
                    String line;
                    while ((line = reader.readLine()) != null) {
                        System.out.println(line);
                        // Separator goes BEFORE every line except the first,
                        // so the file never ends with a trailing newline.
                        if (!firstLine) {
                            writer.newLine();
                        }
                        writer.write(line);
                        firstLine = false;
                    }
                    writer.flush();
                } finally {
                    writer.close();
                }
            } finally {
                reader.close();
            }
        } finally {
            // Release the connection even when reading or writing fails.
            conn.disconnect();
        }
    }
}
以下是 Python 代码:
#!/usr/bin/env python
#encoding=utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
#url="http://www.yuncheng.com/searchcate.aspx?page=4&cate=%E5%9B%BE%E4%B9%A6&subcate=%E5%8E%9F%E5%88%9B%E6%96%87%E5%AD%A6&level3=%E7%8E%84%E5%B9%BB&level4=%E4%B8%9C%E6%96%B9%E7%8E%84%E5%B9%BB#hd "
#url=url.strip()
#import urllib2
#for i in xrange(2,30):
# print i
# html=urllib2.urlopen(url).read()
# f=open("c:/22/%s.txt"%i,"w")
# f.write(html)
# f.close()
import codecs
import urllib2,types
url="http://www.baidu.com"
#f=open("c:/baidu_fetch_by_py.html","w")
f=codecs.open("c:/baidu_fetch_by_py.html","w",encoding='UTF-8')
html=urllib2.urlopen(url).read()
if type(html)==types.UnicodeType:
html=html.encode("utf-8")
print "html covert"
else:
html=html.decode("gbk","ignore").encode("utf-8","ignore")
print "convert html"
f.write(html)
f.close()
两个程序生成的文件用 md5sum 校验,结果完全一致。
最后生成的文件的字节数也相同。
两者都只使用了语言内置的函数,没有外部依赖。