1. 读取数据
# Open the text file at the hard-coded path for reading; `data` is the open
# file object, which yields one line per iteration and can be consumed once.
# NOTE(review): no close() is visible for this handle here - confirm it is
# closed later, or read the lines inside a `with` block.
data= open('e:/java_ws/scalademo/data/sample_naive_bayes_data.txt' , 'r')
2. 把数据随机分割为训练集(training)和测试集(test)
def SplitData(data, max, ind, seed):
    """Randomly partition the lines of ``data`` into a training and a test set.

    For every line one integer is drawn uniformly from ``[0, max]`` (both ends
    inclusive). The line goes to the test set when the draw equals ``ind`` and
    to the training set otherwise, so for ``0 <= ind <= max`` roughly
    ``1 / (max + 1)`` of the lines end up in the test set.

    Args:
        data: Iterable of lines. Each line is either a string or an iterable
            of strings; it is normalized with ``''.join(line)``.
        max: Inclusive upper bound of the random draw. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        ind: Draw value that selects a line for the test set.
        seed: Seed for the random generator (the original author's note says
            it was always 11).

    Returns:
        A ``(train, test)`` tuple of lists of strings, each preserving the
        input order.
    """
    # Seeding the module-level generator makes the split reproducible; it also
    # affects later users of the global ``random`` state, as it always did.
    random.seed(seed)
    train = []
    test = []
    for line in data:
        # One draw per line decides which list receives it.
        bucket = test if random.randint(0, max) == ind else train
        # ''.join leaves a str unchanged and concatenates a sequence of strings.
        bucket.append(''.join(line))
    return train, test
3. 按分隔符拆分一个数据集
def parseData(data, delimiter1, delimiter2):
    """Parse delimited text lines into feature vectors and labels.

    Each non-blank line is split on ``delimiter1``. The first field is the
    label (Y) and the second field holds the features (X), which are split
    again on ``delimiter2``. Fields after the second one are ignored.

    Args:
        data: Iterable of text lines.
        delimiter1: Separator between the label and the feature field.
        delimiter2: Separator between the individual feature values.

    Returns:
        An ``(x, y)`` tuple: ``x`` is a list of float lists (one per line) and
        ``y`` is the list of float labels, both in input order.

    Raises:
        IndexError: If a non-blank line does not contain ``delimiter1``.
        ValueError: If a label or feature value is not a valid float.
    """
    x = []
    y = []
    for line in data:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing on the missing feature field.
        if not line.strip():
            continue
        parts = line.split(delimiter1)
        # Features are parsed before the label, matching the original order of
        # evaluation; float() tolerates the trailing newline on the last value.
        features = [float(value) for value in parts[1].split(delimiter2)]
        label = float(parts[0])
        x.append(features)
        y.append(label)
    return x, y