尽可能减少数据处理中的内存消耗,
以节省服务器成本和时间成本。
'''
{"ad_slots_id":1002,"uuid":"f18343c2-3e09-4abd-b3c5-e00cf33ff84d","industry_pid":0,"industry_id":0,"ip":"3661949473","site":72,"address":"https://info.b2b168.com/s168-54296673.html","create_date":"2019-01-02 14:56:58","ad_id":"33988392","uid":"33988392","keyword":"\u71c3\u70e7\u673a\u914d\u4ef6","pageinfo":""}
{"ad_slots_id":1002,"uuid":"f18343c2-3e09-4abd-b3c5-e00cf33ff84d","industry_pid":0,"industry_id":0,"ip":"3661949473","site":72,"address":"https://info.b2b168.com/s168-54296673.html","create_date":"2019-01-02 14:56:58","ad_id":"50017820","uid":"50017820","keyword":"\u5de5\u4e1a\u6cb9\u70df\u51c0\u5316\u5668","pageinfo":""}
'''
def fileRows(f, debug=False):
    """Return the lines of the text file *f* as a list of raw strings.

    Newline characters are stripped from both ends of every line; nothing
    else is done to the text (in particular, no JSON decoding happens here),
    so a blank line comes back as an empty string.

    Args:
        f: Path of the file to read.
        debug: Accepted for backward compatibility only. It previously gated
            diagnostic prints in an error handler around the per-line work;
            stripping a string cannot fail, so the flag has no effect.

    Returns:
        A list with one string per line, in file order.
    """
    # The context manager closes the file, so no explicit close() is needed.
    with open(f, 'r') as fr:
        return [line.strip('\n') for line in fr]
# Gather the raw, still-undecoded lines of every file whose path contains the
# target date.
for f in file_list:
    if target_date in f:
        day_rows = fileRows(f)
        print(f, ':', len(day_rows))
        rows.extend(day_rows)
        # Drop the per-file list as soon as its lines have been handed over.
        del day_rows
# Aggregate the collected rows per uid: a page-view counter plus every uuid
# and IP value seen for that uid. Each row is decoded on its own and the
# decoded object is not kept once its three fields have been read.
d = {}
for line in rows:
    # Cheap substring pre-filter: a line without the text 'uid' is skipped
    # before paying for a JSON parse.
    if 'uid' not in line:
        continue
    try:
        record = json.loads(line)
        uid, uuid, long_ip = record['uid'], record['uuid'], record['ip']
        stats = d.get(uid)
    except (ValueError, KeyError, TypeError):
        # Best-effort: malformed JSON, a missing field, a non-object payload
        # or an unhashable uid costs only this one row.
        continue
    if stats is None:
        stats = d[uid] = {'uuid': [], 'long_ip': [], 'pv': 0}
    stats['pv'] += 1
    stats['uuid'].append(uuid)
    stats['long_ip'].append(long_ip)
数据预处理阶段:
过早对数据做结构化处理会消耗不必要的内存,例如把由多行 JSON 字符串构成的文件逐行转成 JSON 对象后全部保留在内存中。
改为在数据的业务处理层面逐行结构化:每次只解析一行,内存占用接近恒定,也增强了对内存的可控性。