啤酒与尿布的案例是引发大数据思考的一个非常重要的案例,自这个案例出现之后,对其进行深入研究的人不计其数。本文基于网站页面借鉴这一案例:将用户访问的页面看成对应的产品,通过频繁项集和关联规则来分析访问某一页面的客户还会访问哪些其他页面,进而得到用户访问页面之间的关联规则。
在进行页面关联规则分析的过程中,比较大的问题是数据量:从用户行为数据中可以获取到用户对页面的访问信息,但由于信息量较大,应用这一案例时往往需要进行分布式处理。受所用工具的限制,此处基于python分批读取文件来模拟分布式过程。
具体代码如下:
1、Apriori算法:
#coding=utf-8
# 个人公众号:livandata
import sys
def apriori(D, minSup):
    """Mine the frequent itemsets of the transaction database ``D``.

    Args:
        D: sequence of transactions, each an iterable of hashable items.
            It must support ``len()`` and repeated iteration.
        minSup: minimum support expressed as a fraction of ``len(D)``.

    Returns:
        A pair ``(all_keys, all_C)`` of parallel lists: every itemset that
        survived pruning (as a sorted list of items) and the number of
        transactions that contain it.
    """
    # Pass 1: occurrence count of every single item.
    C1 = {}
    for transaction in D:
        for item in transaction:
            C1[item] = C1.get(item, 0) + 1
    print(C1)

    # Seed the level-wise search with the frequent 1-itemsets, sorted.
    total = len(D)
    keys = sorted([item] for item in C1 if C1[item] * 1.0 / total >= minSup)

    all_keys = []
    all_C = []
    while keys:
        counts = getC(D, keys)
        kept_keys, kept_counts = getCutKeys(keys, counts, minSup, len(D))
        all_keys.extend(kept_keys)
        all_C.extend(kept_counts)
        # Join step: build the next level's candidates from the survivors.
        keys = aproiri_gen(kept_keys)
    return all_keys, all_C
def getC(D, keys):
    """Count the supporting transactions of every candidate in ``keys``.

    Args:
        D: iterable of transactions (each supports ``in``); it is scanned
            once per candidate.
        keys: list of candidate itemsets (iterables of items).

    Returns:
        A list, parallel to ``keys``, holding for each candidate the number
        of transactions in ``D`` that contain all of its items.
    """
    counts = []
    for candidate in keys:
        hits = sum(
            1 for transaction in D
            if all(item in transaction for item in candidate)
        )
        counts.append(hits)
    return counts
def getCutKeys(keys, C, minSup, length):
    """Pruning step: keep the candidates whose relative support is high enough.

    Args:
        keys: candidate itemsets.
        C: absolute counts, parallel to ``keys``.
        minSup: minimum support as a fraction.
        length: number of transactions the counts are relative to.

    Returns:
        ``(kept_keys, kept_counts)``: the surviving candidates and their
        counts, in the original order.
    """
    survivors = [
        (key, C[idx])
        for idx, key in enumerate(keys)
        if float(C[idx]) / length >= minSup
    ]
    kept_keys = [key for key, _ in survivors]
    kept_counts = [count for _, count in survivors]
    return kept_keys, kept_counts
def keyInT(key, T):
    """Return True when every item of itemset ``key`` occurs in transaction ``T``.

    An empty ``key`` is contained in every transaction.
    """
    return all(item in T for item in key)
def aproiri_gen(keys1):
    """Join step: build the (k+1)-item candidates from frequent k-itemsets.

    Two itemsets are joined by taking the sorted union of their items. Only
    unions that are exactly one item larger than the inputs are kept: a
    larger union (e.g. ``[a, b]`` joined with ``[c, d]``) would skip a level,
    mix itemset sizes inside one level and make the caller report the same
    itemset more than once.

    Args:
        keys1: list of itemsets (lists of mutually orderable items).

    Returns:
        List of distinct candidate itemsets, each a sorted list, in order of
        first appearance.
    """
    keys2 = []
    for idx, k1 in enumerate(keys1):
        # The union is symmetric, so each unordered pair is visited once.
        for k2 in keys1[idx + 1:]:
            if k1 == k2:
                continue
            key = []
            for k in list(k1) + list(k2):
                if k not in key:
                    key.append(k)
            if len(key) != len(k1) + 1:
                continue
            key.sort()
            if key not in keys2:
                keys2.append(key)
    return keys2
2、FP_tree算法:
# encoding: utf-8
# 个人公众号:livandata
from collections import defaultdict, namedtuple
# original author information, this version is updated by lina.
__license__ = 'MIT License'
def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items;
    it is copied into a list up front because it has to be scanned twice.
    Each transaction is treated as a set: an item repeated inside one
    transaction is counted once.

    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    # Materialise the input so a one-shot iterator survives both passes, and
    # drop repeated items (keeping first-seen order) so that support means
    # "number of transactions containing the item".
    transactions = [list(dict.fromkeys(t)) for t in transactions]

    # Pass 1: support of every individual item.
    support_of = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            support_of[item] += 1

    # Keep only the frequent items.
    frequent = {item: support for item, support in support_of.items()
                if support >= minimum_support}

    # One global ordering shared by all transactions: decreasing support,
    # ties broken by first appearance. Sorting each transaction on support
    # alone lets equally frequent items appear in different relative orders
    # in different transactions, which splits their joint support across
    # several branches and hides itemsets that are actually frequent.
    rank = {item: position for position, item in
            enumerate(sorted(frequent, key=frequent.get, reverse=True))}

    def clean_transaction(transaction):
        # Strip infrequent items and put the survivors in the global order.
        return sorted((v for v in transaction if v in rank),
                      key=rank.__getitem__)

    # Pass 2: build the FP-tree.
    master = FPTree()
    for transaction in transactions:
        master.add(clean_transaction(transaction))

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            # The conditional tree still contains the item it was built for,
            # hence the `item not in suffix` guard.
            if support >= minimum_support and item not in suffix:
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for
                # frequent itemsets within it.
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
                for s in find_with_suffix(cond_tree, found_set):
                    yield s

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset
class FPTree(object):
    """
    An FP tree.

    This object may only store transaction items that are hashable
    (i.e., all items must be valid as dictionary keys or set members).
    """

    # Head and tail of the chain of nodes that hold one particular item.
    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        # The root node of the tree (no item, no count).
        self._root = FPNode(self, None, None)

        # A dictionary mapping items to the head and tail of a path of
        # "neighbors" that will hit every node containing that item.
        self._routes = {}

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Add a transaction (an iterable of items) to the tree."""
        point = self._root

        for item in transaction:
            next_point = point.search(item)
            if next_point:
                # There is already a node in this tree for the current
                # transaction item; reuse it.
                next_point.increment()
            else:
                # Create a new point and add it as a child of the point we're
                # currently looking at.
                next_point = FPNode(self, item)
                point.add(next_point)

                # Update the route of nodes that contain this item to include
                # our new node.
                self._update_route(next_point)

            point = next_point

    def _update_route(self, point):
        """Add the given node to the route through all nodes for its item."""
        assert self is point.tree

        route = self._routes.get(point.item)
        if route is None:
            # First node for this item; start a new route.
            self._routes[point.item] = self.Route(point, point)
        else:
            # Link the new node behind the current tail and make it the tail.
            route.tail.neighbor = point
            self._routes[point.item] = self.Route(route.head, point)

    def items(self):
        """
        Generate one 2-tuple for each item represented in the tree. The first
        element of the tuple is the item itself, and the second element is a
        generator that will yield the nodes in the tree that belong to the item.
        """
        for item in self._routes:
            yield (item, self.nodes(item))

    def nodes(self, item):
        """
        Generate the sequence of nodes that contain the given item.
        Nothing is generated for an item the tree does not know.
        """
        try:
            node = self._routes[item][0]
        except KeyError:
            return

        while node:
            yield node
            node = node.neighbor

    def prefix_paths(self, item):
        """
        Generate the prefix paths that end with the given item. Each path is
        a list of nodes running from just below the root down to (and
        including) one node that holds `item`.
        """

        def collect_path(node):
            path = []
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        """Print the tree structure and the per-item routes (debugging aid)."""
        print('Tree:')
        self.root.inspect(1)

        # A bare `print` is a no-op expression in Python 3; call it so the
        # blank separator line is actually emitted.
        print()
        print('Routes:')
        for item, nodes in self.items():
            print(' %r' % item)
            for node in nodes:
                print(' %r' % node)
def conditional_tree_from_paths(paths):
    """Build a conditional FP-tree from the given prefix paths.

    `paths` is an iterable of node lists; the last node of every path holds
    the item the tree is conditioned on. At least one path is required.
    """
    tree = FPTree()
    condition_item = None

    # Copy the paths into the new tree. Only the counts of the condition
    # item's nodes are taken over; every other node starts at zero and is
    # rebuilt from those counts below.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            child = point.search(node.item)
            if not child:
                # Add a new node to the tree.
                count = node.count if node.item == condition_item else 0
                child = FPNode(tree, node.item, count)
                point.add(child)
                tree._update_route(child)
            point = child

    assert condition_item is not None

    # Propagate each condition-item count to all of that node's ancestors.
    for path in tree.prefix_paths(condition_item):
        leaf_count = path[-1].count
        for ancestor in path[:-1]:
            ancestor._count += leaf_count

    return tree
class FPNode(object):
    """A node in an FP tree."""

    def __init__(self, tree, item, count=1):
        """
        Create a node holding `item` with the given initial `count` that
        belongs to `tree`. A root node is created with both `item` and
        `count` set to None.
        """
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None
        self._children = {}
        self._neighbor = None

    def add(self, child):
        """Add the given FPNode `child` as a child of this node."""
        if not isinstance(child, FPNode):
            raise TypeError("Can only add other FPNodes as children")

        # A child whose item is already present is silently ignored.
        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """
        Check whether this node contains a child node for the given item.
        If so, that node is returned; otherwise, `None` is returned.
        """
        return self._children.get(item)

    def __contains__(self, item):
        return item in self._children

    @property
    def tree(self):
        """The tree in which this node appears."""
        return self._tree

    @property
    def item(self):
        """The item contained in this node."""
        return self._item

    @property
    def count(self):
        """The count associated with this node's item."""
        return self._count

    def increment(self):
        """Increment the count associated with this node's item."""
        if self._count is None:
            raise ValueError("Root nodes have no associated count.")
        self._count += 1

    @property
    def root(self):
        """True if this node is the root of a tree; false if otherwise."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """True if this node is a leaf in the tree; false if otherwise."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The node's parent"""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a parent.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a parent from another tree.")
        self._parent = value

    @property
    def neighbor(self):
        """
        The node's neighbor; the one with the same value that is "to the right"
        of it in the tree.
        """
        return self._neighbor

    @neighbor.setter
    def neighbor(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a neighbor.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a neighbor from another tree.")
        self._neighbor = value

    @property
    def children(self):
        """The nodes that are children of this node."""
        # dict.itervalues() exists only in Python 2; values() works in both.
        return tuple(self._children.values())

    def inspect(self, depth=0):
        """Print this node and, recursively, its children (debugging aid)."""
        print((' ' * depth) + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

    def __repr__(self):
        if self.root:
            return "<%s (root)>" % type(self).__name__
        return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)
if __name__ == '__main__':
    # Command-line entry point: read transactions from a CSV file (one
    # transaction per row) and print every frequent itemset with its support.
    from optparse import OptionParser
    import csv

    p = OptionParser(usage='%prog data_file')
    p.add_option('-s', '--minimum-support', dest='minsup', type='int',
                 help='Minimum itemset support (default: 2)')
    p.add_option('-n', '--numeric', dest='numeric', action='store_true',
                 help='Convert the values in datasets to numerals (default: false)')
    p.set_defaults(minsup=2)
    p.set_defaults(numeric=False)

    options, args = p.parse_args()
    if len(args) < 1:
        p.error('must provide the path to a CSV file to read')

    transactions = []
    # newline='' lets the csv module do its own line-ending handling.
    with open(args[0], newline='') as database:
        for row in csv.reader(database):
            if options.numeric:
                # `long` only exists in Python 2; Python 3's int is unbounded.
                transactions.append([int(item) for item in row])
            else:
                transactions.append(row)

    result = sorted(
        find_frequent_itemsets(transactions, options.minsup, True),
        key=lambda pair: pair[0],
    )
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))
以上两个算法是从网上找到的,可以作为我们这次数据挖掘的基础算法。
3、data_analysis文件,主要是对数据进行一些基本的分析,将一些分类不在同一级别上的数据进行规整,将一些不容易区分页面信息的数据进行转换。
#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号:livandata
import re
def open_big_data(path):
    """Lazily yield the lines of the text file at ``path``, one at a time.

    The file is opened when iteration starts and closed when the generator
    is exhausted or discarded.
    """
    with open(path) as handle:
        yield from handle
# Ordered rewrite rules applied to every page name:
# (exact match?, keywords, replacement). Each rule is tested against the
# value produced by the rules before it, so the order matters.
_PAGE_RULES = (
    (True, ('今日步数', '免费领月卡', '健康服务'), '我的健康'),
    (True, ('购房贷', '买家私', '装修超预算'), '房屋贷款'),
    (False, ('消息中心',), '消息中心'),
    (False, ('信用卡', '信用额度', '临额调整', '我的额度', '额度评估', '还款'), '信用卡'),
    (False, ('二维码', '支付记录'), '收付款'),
    (False, ('通讯录',), '通讯录'),
    # 'nan' marks pages that are dropped at the end of data_check.
    (False, ('http:', '结束页', '首页', '购买', '申请记录', '交易详情页'), 'nan'),
)


def _load_check_data(pro_path):
    """Read the keyword file: entries separated by a comma and a newline."""
    with open(pro_path, 'r') as f:
        raw = f.read()
    # A trailing ",\n" would otherwise leave an empty keyword that matches
    # every page name; a missing final newline would leave a stray comma.
    entries = (chunk.strip().rstrip(',').strip() for chunk in raw.split(',\n'))
    return [entry for entry in entries if entry]


def _normalize_page(page, check_data):
    """Map one raw page name to its category by applying the rules in order."""
    for exact, keywords, replacement in _PAGE_RULES:
        if exact:
            hit = page in keywords
        else:
            hit = any(word in page for word in keywords)
        if hit:
            page = replacement
    # Keywords are plain text, so use substring tests rather than treating
    # them as regular expressions.
    for name in check_data:
        if name in page:
            page = name
    return page


def data_check(sess_data, pro_path='pingan_pro'):
    """Normalise the page names of every session in ``sess_data``.

    Each page name is rewritten to a coarser category (fixed rules first,
    then the keywords listed in the file at ``pro_path``). Afterwards every
    session is de-duplicated, keeping first-seen order, and the pages marked
    'nan' are removed.

    Args:
        sess_data: list of sessions, each a list of page-name strings.
            The list and its sessions are modified in place.
        pro_path: path of the keyword file.

    Returns:
        ``sess_data`` itself, after normalisation.
    """
    check_data = _load_check_data(pro_path)

    for session in sess_data:
        for j, page in enumerate(session):
            session[j] = _normalize_page(page, check_data)

    for i, session in enumerate(sess_data):
        sess_data[i] = [page for page in dict.fromkeys(session) if page != 'nan']
    return sess_data
4、refresh_data文件,主要是对分析的文件进行存储,通过文件存取的方式实现分布式处理:
#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号:livandata
import os
def write_result(items):
    """Append every ``key:value`` pair of ``items`` to 'data_result.txt'.

    One pair is written per line; the file is created when it is missing.
    """
    with open('data_result.txt', 'a+') as out:
        out.writelines(
            str(key) + ':' + str(items[key]) + '\n' for key in items
        )
def read_result(items):
    """Merge ``items`` with the counts already stored in 'data_result.txt'.

    Args:
        items: mapping from itemset key (string) to its count for the
            current batch (anything ``int()`` accepts).

    Returns:
        A pair ``(updated_lines, new_items)``:

        * ``updated_lines`` maps each stored line whose key also occurs in
          ``items`` (the raw line, as read) to its replacement line
          ``'key:old+new\\n'``;
        * ``new_items`` holds the entries of ``items`` whose key is not in
          the file yet, in the order of ``items``.
    """
    updated_lines = {}
    matched = set()
    with open('data_result.txt', 'r') as f:
        for raw_line in f:
            # Split on the last ':' so a key that itself contains ':' is
            # still parsed correctly.
            key, sep, stored = raw_line.rstrip('\n').rpartition(':')
            if not sep or key not in items:
                continue
            total = str(int(stored) + int(items[key]))
            updated_lines[raw_line] = key + ':' + total + '\n'
            matched.add(key)

    new_items = {key: value for key, value in items.items()
                 if key not in matched}
    return updated_lines, new_items
def refresh_data(items):
    """Fold the counts of one batch into 'data_result.txt'.

    When the result file does not exist yet it is created from ``items``.
    Otherwise the stored count of every key that is already present is
    increased, the keys seen for the first time are appended, and the file
    is replaced by the merged version.

    Args:
        items: mapping from itemset key (string) to its count in this batch.
    """
    if not os.path.exists('data_result.txt'):
        write_result(items)
        return

    datas_res, datas_new = read_result(items)
    print(datas_res)

    # Write the merged content to a scratch file opened once in 'w' mode, so
    # leftovers of an interrupted earlier run cannot leak into the result.
    with open('data_result.txt', 'r') as old, \
            open('data_result2.txt', 'w') as merged:
        for line in old:
            out = datas_res.get(line, line)
            # Guard against a missing final newline gluing two records.
            merged.write(out if out.endswith('\n') else out + '\n')
        for key in datas_new:
            merged.write(str(key) + ':' + str(datas_new[key]) + '\n')

    # Swap the files in a single step.
    os.replace('data_result2.txt', 'data_result.txt')
5、pro文件(代码中读取的文件名为pingan_pro):即对页面名称进行转换时所需要的关键词库,每行一个关键词,以逗号结尾:
96搜索,
借钱,
口袋社区,
领券中心,
猜金价,
种摇钱树,
车主贷,
宅易通,
6、run文件,主要是运行文件的过程:
#!/usr/bin/env python
# _*_ UTF-8 _*_
# 个人公众号:livandata
import pandas as pd
import data_analysis as das
import Fp_growth as fpg
import refresh_data as rfd
# NOTE(review): this path looks like it lost its directory separators
# (possibly '../data/sub_customer.csv') -- confirm before running.
path='..datasub_customer.csv'
chunkSize = 10  # rows read per batch
n = 5           # denominator used to turn a support count into a ratio
minSup = 0.6    # minimum support ratio for an itemset to be stored

# Read the CSV in batches so the whole file never has to fit in memory.
reader = pd.read_csv(path, chunksize=chunkSize, dtype=str)
for chunk in reader:
    chunk = chunk.fillna('nan')
    pages = chunk[chunk['page_name'] != 'nan']['page_name']

    # Each cell is split on double quotes; the bracket and comma fragments
    # left between the quoted names are discarded.
    page_names = []
    for raw in pages:
        names = raw.split('"')
        page_names.append([j for j in names if j not in ('[', ']', ',')])

    # Normalise all sessions of the batch, then drop the empty ones.
    page_names = das.data_check(page_names)
    page_names = [i for i in page_names if i != []]

    frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                   include_support=True)

    # Keep the itemsets whose support ratio reaches minSup; keys and values
    # are stored as strings because they are written to a text file.
    items = {}
    for itemset, support in frequent_itemsets:
        if float(support) / n >= minSup:
            items[str(itemset)] = str(support)

    # Merge this batch's counts into the accumulated result file.
    rfd.refresh_data(items)

print('Iteration is stopped')
以上是算法运行的全过程,融合了伪分布式处理,主要是参考了hadoop的处理方式。
---------------------