import com.mongodb.BasicDBObject
import com.mongodb.DBCollection
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import org.jsoup.select.Elements
public class ZhongYuan {
public static final DBCollection test = MongoUtils.getCollectionByName("name", "table",
"port")
public static final DBCollection html = MongoUtils.getCollectionByName("name", "table",
"port")
/**
 * Crawls listing pages 500 through 597 of sh.centanet.com and hands each parsed
 * page to {@code parseList}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // Walk the paginated listing index one page at a time.
    for (int i = 500; i < 598; i++) {
        String url = "http://sh.centanet.com/xiaoqu/g" + i + "/";
        String result = RequestUtil.doGet(url, "GBK");
        if (result == null) {
            // doGet yields null for a non-200 status or an I/O failure, and Jsoup.parse
            // rejects null input; skip this page instead of aborting the whole crawl.
            System.err.println("page skipped (no response body)=====>" + i);
            continue;
        }
        Document doc = Jsoup.parse(result);
        // Extract and persist the listings contained in the parsed page.
        parseList(doc);
        System.out.println("page=====>" + i);
    }
}
private static void parseList(Document doc){
Elements elements = doc.select("div.house-listBox>div");
int j = 0;
for(Element element : elements){
String name = element.select(".house-title a").first().text();
html.save(new BasicDBObject("name",name).append("html",element.toString()))
String regionstr = element.select("div>div>p").first().text().replace(' ','-');
String region = regionstr.split("-")[0];
String address = null;
if(regionstr.split("-").length>1) {
address = regionstr.split("-")[1] + regionstr.split("-")[2];
} else {
address = regionstr.split("-")[1];
}
String price = element.select("div>div").last().select("p").first().text();
test.insert(new BasicDBObject("city","上海").append("region",region).append("name",name)
.append("avg_price",price));
System.out.println(name);
j++;
}
System.out.println(j);
}
/**
 * Alternative extractor for pages whose listings are {@code div.section>ul>li} items.
 *
 * Only items whose markup contains "room-img" and that have a title link are handled.
 * For each, the text of the 2nd {@code <p>} is taken as the price and the text of the
 * 4th as the region, and a record is inserted into {@code test}.
 *
 * @param doc parsed listing page
 */
private static void parseList1(Document doc) {
    for (Element element : doc.select("div.section>ul>li")) {
        if (!element.toString().contains("room-img")) {
            continue;
        }
        Element titleLink = element.select("h5.room-name a").first();
        if (titleLink == null) {
            // first() returns null when nothing matched; there is no name to store.
            continue;
        }
        String name = titleLink.text();

        // Scoped per item: a listing with fewer paragraphs gets null here rather than
        // silently inheriting the value of the previously processed listing.
        Elements paragraphs = element.select("p");
        String price = paragraphs.size() > 1 ? paragraphs.get(1).text() : null;
        String region = paragraphs.size() > 3 ? paragraphs.get(3).text() : null;

        System.out.println(name + price + region);
        test.insert(new BasicDBObject("city", "上海").append("region", region).append("name", name)
                .append("avg_price", price));
    }
}
}
相关的 doGet 请求由自己封装的工具类 RequestUtil 完成，可以参考下面的代码。上面的 String result = RequestUtil.doGet(url, "GBK"); 这一句调用的就是这个工具类；这里也可以改用 jsoup 自带的请求方法。
/**
 * Sends a GET request with the "UTF-8" charset, no extra headers and URL encoding
 * switched off.
 *
 * @param url address to request
 * @return the result of the four-argument {@code doGet} for these settings
 */
public static String doGet(String url) {
    final boolean encodeUrl = false;
    return doGet(url, null, "UTF-8", encodeUrl);
}
/**
 * Sends a GET request with the "UTF-8" charset and no extra headers.
 *
 * @param url address to request
 * @param encodeUrl forwarded unchanged to the four-argument {@code doGet}
 * @return the result of the four-argument {@code doGet} for these settings
 */
public static String doGet(String url, boolean encodeUrl) {
    final String charset = "UTF-8";
    return doGet(url, null, charset, encodeUrl);
}
/**
 * Sends a GET request with the given charset, no extra headers and URL encoding
 * switched on.
 *
 * @param url address to request
 * @param charset forwarded unchanged to the four-argument {@code doGet}
 * @return the result of the four-argument {@code doGet} for these settings
 */
public static String doGet(String url, String charset) {
    final boolean encodeUrl = true;
    return doGet(url, null, charset, encodeUrl);
}
/**
 * Sends a GET request with the given headers, the "UTF-8" charset and URL encoding
 * switched on.
 *
 * @param url address to request
 * @param headers forwarded unchanged to the four-argument {@code doGet}
 * @return the result of the four-argument {@code doGet} for these settings
 */
public static String doGet(String url, Map<String, String> headers) {
    final String charset = "UTF-8";
    final boolean encodeUrl = true;
    return doGet(url, headers, charset, encodeUrl);
}
/**
 * Sends a GET request and returns the response body as text.
 *
 * A new client, built with the {@code USERAGENT_CHROME} user agent, is created for the
 * call and closed before returning.
 *
 * @param url address to request
 * @param headers extra request headers to add, or {@code null} for none
 * @param charset passed to {@code encodingUrl} (when {@code encodeUrl} is set) and to
 *                {@code EntityUtils.toString} when reading the body
 * @param encodeUrl whether {@code url} is first passed through {@code encodingUrl}
 * @return the body when the status code is 200 and the response carries an entity;
 *         {@code null} for any other status, a missing entity, or an I/O error
 */
public static String doGet(final String url, Map<String, String> headers, String charset, boolean encodeUrl) {
    String result = null;
    String requestUrl = url;
    // try-with-resources closes the response and then the client on every path.
    try (CloseableHttpClient client = HttpClients
            .custom()
            .setUserAgent(USERAGENT_CHROME)
            .build()) {
        if (encodeUrl) {
            requestUrl = encodingUrl(url, charset);
        }
        HttpGet httpGet = new HttpGet(requestUrl);
        // TODO(review): no RequestConfig timeout is applied to this request; consider
        // setting socket/connect timeouts so a stalled server cannot block the caller.
        if (headers != null) {
            for (Map.Entry<String, String> entry : headers.entrySet()) {
                httpGet.addHeader(entry.getKey(), entry.getValue());
            }
        }
        try (CloseableHttpResponse response = client.execute(httpGet)) {
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != 200) {
                // Make the reason for the null result visible to whoever runs the crawl.
                System.err.println("GET " + requestUrl + " returned HTTP status " + statusCode);
            } else if (response.getEntity() != null) {
                // EntityUtils.toString rejects a null entity, hence the guard.
                result = EntityUtils.toString(response.getEntity(), charset);
            }
        }
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so a single handler covers both.
        System.err.println("GET " + requestUrl + " failed");
        e.printStackTrace();
    }
    return result;
}