接口文档:http://lbs.baidu.com/index.php?title=webapi/guide/webservice-placeapi#service-page-anchor-1-3
1、示例中的数据是从链家拿到的北京小区信息,包含小区名和所在行政区,例如朱雀门/西城,实例化CheckEstateData(朱雀门, 西城)
2、API中region可以多维度组合,为防止全国或同行政区有重名,例如北京有多个万达广场,最好加上region=市+区
3、百度API有时会把小区名莫名解析成错误的名称,例如《东四西大街50号院》->《东4西大街》。这类记录不会被直接更新,而是 set status = -1,需要手动维护数据
# -*- coding: utf-8 -*- import urllib2, json, sys, time reload(sys) sys.setdefaultencoding("utf8") GLOBAL_URL = "http://api.map.baidu.com/place/v2/search?region=北京%s&city_limit=true&query=%s&page_size=10&output=json&ak=%s" GLOBAL_AK = "" GLOBAL_SQL = "SELECT `name`, `district` FROM estate WHERE location IS NULL and `status` IS NULL LIMIT 100 " COLUMN_LIST = ["area", "address", "location", "province", "city", "uid"] UPDATE_SQL = """ UPDATE estate SET %s, source_name = '%s', `status` = %s, `result` = '%s' WHERE `name` = '%s' AND `district` = '%s' """ """ estate的status字段 更新数据状态 小区名全匹配且全属性 0 小区名全匹配属性不全 1 小区名全匹配无detail=1 2 小区名无全匹配第一个detail=1的全属性数据 3 小区名无全匹配第一个detail=1的属性不全数据 4 小区名无全匹配没有detail=1的数据 5 没搜到任何数据 -1 """
class CheckEstate:
    """Placeholder for the local storage layer (database access).

    The rest of this file constructs ``CheckEstate(sql)`` (or
    ``CheckEstate(None)`` to fetch pending rows) and then reads ``error`` and
    ``r``.  This stub accepts that call shape and exposes those attributes so
    callers do not fail, but it performs no database I/O.

    TODO: replace with a real implementation that executes ``sql`` and, when
    ``sql`` is None, loads the pending estates into ``r``.

    Attributes:
        sql: The SQL statement passed in, or None.
        error: Error message string, or None when nothing went wrong.
        r: Iterable of row dicts; callers read the "name" and "district"
            keys.  Always empty in this stub.
    """

    def __init__(self, sql=None):
        # Local storage: nothing is executed here; see the class docstring.
        self.sql = sql
        self.error = None
        self.r = []
def _sql_escape(value):
    """Return ``value`` as text that can sit inside a single-quoted SQL string.

    Backslashes are doubled and single quotes are doubled, so API-provided
    text (names, addresses, JSON payloads) cannot terminate the literal or
    have its JSON escape sequences eaten by the server.

    NOTE(review): assumes MySQL's default sql_mode, where backslash is an
    escape character inside string literals; confirm NO_BACKSLASH_ESCAPES is
    not enabled.
    """
    text = "%s" % (value,)
    return text.replace("\\", "\\\\").replace("'", "''")


class CheckEstateData:
    """Look one estate up in the Baidu Place API and store the outcome.

    Instantiating the class performs the whole workflow: query the API,
    choose the best result, build an UPDATE statement and hand it to
    ``CheckEstate``.

    Attributes:
        name: Estate name used as the query string.
        district: District used to narrow the search region.
        datas: List of API results, or None when the request failed.
        error: Error message string, or None.
        msg: Success message string, or None.
    """

    def __init__(self, name, district):
        self.name = name
        self.district = district
        self.error = None
        self.msg = None
        self.__get_data__()
        # NOTE(review): an empty result list is falsy, so nothing is written
        # for estates the API knows nothing about (their status stays NULL
        # and both error and msg stay None).
        if self.datas:
            self.do()

    def __get_data__(self):
        """Query the API and set ``self.datas`` / ``self.error``.

        Never raises: any exception is converted into ``self.error``.
        """
        try:
            url = GLOBAL_URL % (self.district, self.name, GLOBAL_AK)
            print("URL: %s" % url)
            response = urllib2.urlopen(url)
            try:
                payload = json.loads(response.read())
            finally:
                # Release the connection even when reading or parsing fails.
                response.close()
            if payload["status"] == 0 and payload["message"] == "ok":
                self.datas, self.error = payload["results"], None
            else:
                self.datas, self.error = None, "ERR: API return %s" % payload["message"]
        except Exception as e:
            self.datas, self.error = None, "ERR: get data %s %s" % (self.name, str(e))

    def update(self, sql):
        """Pass ``sql`` to ``CheckEstate`` and record the outcome.

        Sets ``self.error`` when the storage object reports an error,
        otherwise sets ``self.msg``.
        """
        print("INFO: sql %s" % sql)
        s = CheckEstate(sql)
        if s.error:
            self.error = "ERR: UPDATE ERR, %s" % s.error
        else:
            self.msg = "INFO: %s ok" % self.name

    def check_colunm(self, data):
        """Return True when ``data`` contains every key in COLUMN_LIST."""
        return set(COLUMN_LIST).issubset(data)

    def get_info(self):
        """Choose the API result to store and its status code.

        Returns:
            A ``(result, status)`` tuple:
            ``(d, 0)`` / ``(d, 1)`` -- first result whose name equals
            ``self.name`` and has detail == 1, with all / not all columns;
            ``(d, 3)`` / ``(d, 4)`` -- otherwise the first result with
            detail == 1, with all / not all columns;
            ``(None, -1)`` -- no result has detail == 1.
        """
        for d in self.datas:
            if d["name"] == self.name and d.get("detail") == 1:
                # Fixed: this used to call the bare name check_colunm(),
                # which raised NameError on every exact-name match.
                return d, (0 if self.check_colunm(d) else 1)
        for d in self.datas:
            if d.get("detail") == 1:
                return d, (3 if self.check_colunm(d) else 4)
        return None, -1

    def do(self):
        """Build the UPDATE statement for the chosen result and run it.

        All text interpolated into the statement is escaped with
        ``_sql_escape``; ``status`` is an integer and is inserted as is.
        """
        r, status = self.get_info()
        result = _sql_escape(json.dumps(self.datas, ensure_ascii=False))
        name = _sql_escape(self.name)
        district = _sql_escape(self.district)
        if r:
            assignments = []
            # Walk COLUMN_LIST (not the dict) so the column order is stable.
            for k in COLUMN_LIST:
                if k not in r:
                    continue
                # location is a nested object and is stored as JSON text.
                v = json.dumps(r[k]) if k == "location" else r[k]
                assignments.append("%s = '%s'" % (k, _sql_escape(v)))
            value = ", ".join(assignments)
            sql = UPDATE_SQL % (value, _sql_escape(r["name"]), status, result, name, district)
        else:
            sql = "UPDATE estate set `status` = %s, `result` = '%s' WHERE `name` = '%s' and district = '%s'" % (status, result, name, district)
        self.update(sql)


def get_estate_info():
    """Process every pending estate exposed by ``CheckEstate(None)``.

    Returns:
        The storage error message when the pending rows could not be loaded,
        otherwise None.
    """
    c = CheckEstate(None)
    if c.error:
        print(c.error)
        return c.error
    for d in c.r:
        estate = CheckEstateData(d["name"], d["district"])
        if estate.error:
            print(estate.error)
        else:
            print(estate.msg)
        # Pause between API requests.
        time.sleep(0.5)
    return None


if __name__ == "__main__":
    get_estate_info()
为防止链家和百度的小区名有差异,在存储时将API的所有数据本地存一份
-- Table holding one row per estate: the source name/district plus the
-- attributes and raw response fetched from the Baidu Place API.
CREATE TABLE `estate` (
  `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(50) NOT NULL COMMENT '小区名',  -- estate name
  `source_name` varchar(50) DEFAULT NULL COMMENT 'baidu小区原名',  -- estate name as returned by Baidu
  `district` varchar(20) DEFAULT NULL COMMENT '区(链家侧数据)',  -- district (Lianjia-side data)
  `area` varchar(20) DEFAULT NULL COMMENT '区(百度侧数据)',  -- district (Baidu-side data)
  `street_id` varchar(50) DEFAULT NULL COMMENT '街景图id',  -- street-view image id
  `address` varchar(100) DEFAULT NULL COMMENT '地址',  -- address
  `location` json DEFAULT NULL COMMENT '坐标',  -- coordinates
  `province` varchar(30) DEFAULT NULL COMMENT '省份',  -- province
  `city` varchar(30) DEFAULT NULL COMMENT '城市',  -- city
  `uid` varchar(100) DEFAULT NULL COMMENT 'poi的唯一标示,可用于详情检索',  -- unique POI id, usable for detail lookups
  `status` tinyint(4) DEFAULT NULL COMMENT '更新数据状态,具体含义看代码',  -- update status; meanings are listed in the code
  `result` json DEFAULT NULL COMMENT '接口返回的数据',  -- data returned by the API
  PRIMARY KEY (`id`),
  KEY `idx_name` (`name`)
) ENGINE=InnoDB AUTO_INCREMENT=8192 DEFAULT CHARSET=utf8mb4;