1 map()是python的高阶函数,python高阶函数是指可以把函数作为参数的函数,函数式编程就是指这种高度抽象的编程范式.
2 由于map()会把第二个参数(可迭代对象)中的元素一个一个地传给第一个参数(函数),利用这个特点,可以实现字符串的分割.
# Demo: the built-in map() hands the items of its iterable to the function one
# at a time, so mapping over str(a) splits the number into single characters.
a = 12300000


def shuchu(k):
    """Return ``k`` unchanged; an identity function used to demonstrate map()."""
    return k


# Printing the map object itself shows its repr, not the mapped items;
# wrap it in list() to materialize the results.
print(map(shuchu, str(a)))
b = list(map(shuchu, str(a)))
print(b)
# <map object at 0x7ff47f3c0828>
# ['1', '2', '3', '0', '0', '0', '0', '0']

# A dict cannot be passed to map() directly because a dict is not callable.
d = {1: '(', -1: ')'}
try:
    list(map(d, [1, 1, -1]))
except TypeError as err:
    # Caught so the demonstration does not abort the rest of the script.
    print(err)  # 'dict' object is not callable

# Pass the dict's lookup method instead to map keys to values.
c = list(map(d.get, [1, 1, -1]))
print(c)
# ['(', '(', ')']
3 pandas的Series.map方法可以做两个DataFrame表格的某些列的融合.
import pandas as pd
import numpy as np

# Demo: Series.map with another Series as the mapper. Each value of df1['A']
# is looked up in the *index* of df2['B']; values with no matching index label
# become NaN.
df1 = pd.DataFrame({
    'A': [1, 2, 3, 'df2的索引里没有这个,所以融合后是空'],
    'B': ['a', 'b', 'c', 'd'],
    'C': ['Tom', 'Jack', 'Bob', 'roushi'],
})
print(df1)

df2 = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [6, 7, 8, 9],
})
print(df2)

# Equivalent to joining df1's column A against df2's index and then taking
# the matching value of df2['B'].
df1['df1的A列与df2的索引做融合后再映射'] = df1['A'].map(df2['B'])
print(df1)

# Sample output from the original run (column alignment reconstructed):
#                       A  B       C
# 0                     1  a     Tom
# 1                     2  b    Jack
# 2                     3  c     Bob
# 3  df2的索引里没有这个,所以融合后是空  d  roushi
#    A  B
# 0  1  6
# 1  2  7
# 2  3  8
# 3  4  9
#                       A  B       C  df1的A列与df2的索引做融合后再映射
# 0                     1  a     Tom                    7.0
# 1                     2  b    Jack                    8.0
# 2                     3  c     Bob                    9.0
# 3  df2的索引里没有这个,所以融合后是空  d  roushi                    NaN
4 基本用法
import pandas as pd
from pandas import Series, DataFrame

# Example 1: map a column through a function and then through a dict.
data = DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# str.lower converts uppercase characters to lowercase. It is needed because
# the keys of meat_to_animal are all lowercase while some entries in the
# 'food' column are capitalized.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
data.info()

# Same mapping in one step, with a lambda doing the lowercase + dict lookup.
a = data['food'].map(lambda x: meat_to_animal[x.lower()])
print(a)

# Sample output from the original run:
#           food  ounces  animal
# 0        bacon     4.0     pig
# 1  pulled pork     3.0     pig
# 2        bacon    12.0     pig
# 3     Pastrami     6.0     cow
# 4  corned beef     7.5     cow
# 5        Bacon     8.0     pig
# 6     pastrami     3.0     cow
# 7    honey ham     5.0     pig
# 8     nova lox     6.0  salmon
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 9 entries, 0 to 8
# Data columns (total 3 columns):
# food      9 non-null object
# ounces    9 non-null float64
# animal    9 non-null object
# dtypes: float64(1), object(2)
# memory usage: 296.0+ bytes
# 0       pig
# 1       pig
# 2       pig
# 3       cow
# 4       cow
# 5       pig
# 6       cow
# 7       pig
# 8    salmon
# Name: food, dtype: object

# Example 2: build a lookup dict from two columns of one frame and use it to
# translate a column of another frame.
df1 = pd.DataFrame({'a': [1, 2, 3, 4, 5],
                    'b': ['一', '二', '三', '四', '五']})
df2 = pd.DataFrame({'c': [5, 4, 2, 1, 2, 3]})
d = df2['c'].map(dict(zip(df1['a'], df1['b'])))
print(d)
# 0    五
# 1    四
# 2    二
# 3    一
# 4    二
# 5    三
# Name: c, dtype: object
import pandas as pd
from pandas import Series, DataFrame

# Example: map() on an index. Replace each timestamp label with its day of
# the month.
index = pd.date_range('2017-08-15', periods=10)
ser = Series(list(range(10)), index=index)
print(ser)
ser.index = ser.index.map(lambda x: x.day)
print(ser)

# Sample output from the original run:
# 2017-08-15    0
# 2017-08-16    1
# 2017-08-17    2
# 2017-08-18    3
# 2017-08-19    4
# 2017-08-20    5
# 2017-08-21    6
# 2017-08-22    7
# 2017-08-23    8
# 2017-08-24    9
# Freq: D, dtype: int64
# 15    0
# 16    1
# 17    2
# 18    3
# 19    4
# 20    5
# 21    6
# 22    7
# 23    8
# 24    9
# dtype: int64

# Multiply the elements of two lists pairwise and sum the products.
# With several iterables, the built-in map() passes one element from each
# list to the function per call.
a = [1, 2, 3, 4]
b = [2, 3, 4, 5]
sumab = sum(map(lambda x, y: x * y, a, b))
print(sumab)
5 Pool和ThreadPool两个类都提供map方法: Pool(来自multiprocessing)基于进程工作, ThreadPool(来自multiprocessing.dummy)基于线程工作。
import datetime as dt
import matplotlib.pyplot as plt
import dask.dataframe as dd
from multiprocessing import Pool
# Fan a DataFrame out to a pool of worker processes with Pool.map.
# The guard keeps worker processes that re-import this module (spawn start
# method) from re-running the pool setup themselves.
if __name__ == "__main__":
    listdata = []
    processnum = 12
    user_repay = pd.read_hdf('../data/user_repay_second.h5')
    for i in range(processnum):
        # presumably fenpei returns the i-th of `processnum` slices of
        # user_repay -- confirm against its definition (not visible here)
        datai = fenpei(user_repay, i, processnum)
        # print(datai['index'].nunique())  # binned by index
        listdata.append([i, datai])
        del datai
    time1 = dt.datetime.now()
    with Pool(processnum) as p:
        # Each [i, datai] pair is passed to tfun as a single argument.
        p.map(tfun, listdata)
    # NOTE(review): the original indentation was lost; the elapsed-time print
    # is placed after the with-block -- confirm this matches the intent.
    print((dt.datetime.now() - time1).total_seconds())
    del listdata
import time
from datetime import datetime
from multiprocessing.dummy import Pool as ThreadPool
from functools import partial
def add(x, y):
    """Return ``x + y``, printing a timestamped message on entry and on exit."""
    print(datetime.now(), "enter add func...")
    print(datetime.now(), "leave add func...")
    return x+y
def add_wrap(args):
    """Unpack ``args`` and call ``add`` with the unpacked values.

    This adapter lets a mapper that passes one item per call (such as a
    pool's ``map``) drive the two-argument ``add`` with tuples like ``(1, 2)``.
    """
    return add(*args)
if __name__ == "__main__":
pool = ThreadPool(4) # 池的大小为4
print(pool.map(add_wrap, [(1,2),(3,4),(5,6)]))
#close the pool and wait for the worker to exit