• Practical Exercise (12): Using Frequent Itemsets and Association Rules to Analyze the Relationships Between the Pages Users Visit

The beer-and-diapers story is one of the landmark cases that set off thinking about big data, and countless people have studied it in depth since it first appeared. This article borrows the idea and applies it to website pages: each page a user visits is treated as a product, and frequent-itemset association rules are used to work out which other pages the visitors of a given page also tend to visit, i.e. the association rules between the pages users visit.
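To make the idea concrete, here is a minimal sketch (with made-up sessions and placeholder page names, not the real data) of how the support and confidence of a rule between two pages are computed:

# A minimal sketch with made-up sessions: each session is the set of pages
# one user visited. The page names here are placeholders, not the real data.
sessions = [
    {'credit_card', 'repayment', 'message_center'},
    {'credit_card', 'repayment'},
    {'home_loan', 'credit_card'},
    {'message_center'},
]

n = len(sessions)
sup_a = sum('credit_card' in s for s in sessions) / n                   # support of {credit_card}
sup_ab = sum({'credit_card', 'repayment'} <= s for s in sessions) / n   # support of the pair
confidence = sup_ab / sup_a   # P(repayment | credit_card)
print(sup_ab, confidence)     # 0.5 and 0.666...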

The biggest practical problem when mining page association rules is data volume. User-behavior data records which pages each user visited, but the records are large, so applying this case in practice usually calls for distributed processing. Constrained by the tools available, this post simulates the distributed process with batched file reads in Python.
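The batching itself is straightforward; the sketch below shows the skeleton of the pseudo-distributed pass, assuming a placeholder file name and chunk size (the full pipeline appears in section 6):

import pandas as pd

# Minimal sketch of the batched ("pseudo-distributed") pass: each chunk is
# processed on its own, and only small per-chunk summaries are kept.
# 'behavior.csv' and the chunk size are placeholder assumptions.
reader = pd.read_csv('behavior.csv', iterator=True, dtype=str)
while True:
    try:
        chunk = reader.get_chunk(10000)
    except StopIteration:
        break
    # ... mine frequent itemsets in this chunk, then merge the counts ...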

The code is as follows:

1. The Apriori algorithm:

#coding=utf-8
# Personal WeChat official account: livandata
import sys

def apriori(D, minSup):
    # Count the support of each individual item.
    C1 = {}
    for T in D:
        for I in T:
            if I in C1:
                C1[I] += 1
            else:
                C1[I] = 1
    print(C1)
    _keys1 = C1.keys()
    keys1 = []
    for i in _keys1:
        keys1.append([i])
    n = len(D)
    # Keep only the singletons whose support meets minSup.
    cutKeys1 = []
    for k in keys1[:]:
        if C1[k[0]] * 1.0 / n >= minSup:
            cutKeys1.append(k)
    cutKeys1.sort()
    keys = cutKeys1
    all_keys = []
    all_C = []
    # Alternate join and prune steps until no candidates survive.
    while keys != []:
        C = getC(D, keys)
        cutKeys, curC = getCutKeys(keys, C, minSup, len(D))
        for key in cutKeys:
            all_keys.append(key)
        for c in curC:
            all_C.append(c)
        keys = aproiri_gen(cutKeys)
    return all_keys, all_C

def getC(D, keys):
    '''Count the occurrences of each itemset in keys.'''
    C = []
    for key in keys:
        c = 0
        for T in D:
            have = True
            for k in key:
                if k not in T:
                    have = False
            if have:
                c += 1
        C.append(c)
    return C

def getCutKeys(keys, C, minSup, length):
    '''Pruning step: keep only itemsets whose support meets minSup.'''
    keyss = []
    Cs = []
    for i, key in enumerate(keys):
        if float(C[i]) / length >= minSup:
            keyss.append(key)
            Cs.append(C[i])
    return keyss, Cs

def keyInT(key, T):
    '''Check whether itemset key appears in transaction T.'''
    for k in key:
        if k not in T:  # one missing item is enough to return False
            return False
    return True

def aproiri_gen(keys1):
    '''Join step: merge pairs of frequent itemsets into new candidates.'''
    keys2 = []
    for k1 in keys1:
        for k2 in keys1:
            if k1 != k2:
                key = []
                for k in k1:
                    if k not in key:
                        key.append(k)
                for k in k2:
                    if k not in key:
                        key.append(k)
                key.sort()
                if key not in keys2:
                    keys2.append(key)
    return keys2
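As a quick sanity check, the function above can be run on a handful of made-up transactions (the item names are placeholders):

# Hypothetical transactions; with minSup=0.5 an itemset must appear in at
# least 2 of the 4 transactions to survive.
D = [['a', 'b', 'c'],
     ['a', 'b'],
     ['a', 'c'],
     ['b', 'd']]
keys, counts = apriori(D, 0.5)
for k, c in zip(keys, counts):
    print(k, c)   # ['a'] 3, ['b'] 3, ['c'] 2, ['a', 'b'] 2, ['a', 'c'] 2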
2. The FP_tree (FP-growth) algorithm:

# encoding: utf-8
# Personal WeChat official account: livandata
from collections import defaultdict, namedtuple

# Original author information; this version was updated by lina.
__license__ = 'MIT License'

def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items.
    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    items = defaultdict(lambda: 0)  # mapping from items to their supports

    # Load the passed-in transactions and count the support that individual
    # items have.
    for transaction in transactions:
        for item in transaction:
            items[item] += 1

    # Remove infrequent items from the item support dictionary.
    items = dict((item, support) for item, support in items.items()
                 if support >= minimum_support)

    # Build our FP-tree. Before any transactions can be added to the tree, they
    # must be stripped of infrequent items and their surviving items must be
    # sorted in decreasing order of frequency.
    def clean_transaction(transaction):
        transaction = filter(lambda v: v in items, transaction)
        # Materialize the filter object into a temporary list so it can be
        # sorted without affecting other uses of the variable.
        transaction_list = list(transaction)
        transaction_list.sort(key=lambda v: items[v], reverse=True)
        return transaction_list

    master = FPTree()
    for transaction in map(clean_transaction, transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            if support >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for frequent
                # itemsets within it.
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
                for s in find_with_suffix(cond_tree, found_set):
                    yield s  # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset

class FPTree(object):
    """
    An FP tree.

    This object may only store transaction items that are hashable
    (i.e., all items must be valid as dictionary keys or set members).
    """

    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        # The root node of the tree.
        self._root = FPNode(self, None, None)

        # A dictionary mapping items to the head and tail of a path of
        # "neighbors" that will hit every node containing that item.
        self._routes = {}

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Add a transaction to the tree."""
        point = self._root

        for item in transaction:
            next_point = point.search(item)
            if next_point:
                # There is already a node in this tree for the current
                # transaction item; reuse it.
                next_point.increment()
            else:
                # Create a new point and add it as a child of the point we're
                # currently looking at.
                next_point = FPNode(self, item)
                point.add(next_point)

                # Update the route of nodes that contain this item to include
                # our new node.
                self._update_route(next_point)

            point = next_point

    def _update_route(self, point):
        """Add the given node to the route through all nodes for its item."""
        assert self is point.tree

        try:
            route = self._routes[point.item]
            route[1].neighbor = point  # route[1] is the tail
            self._routes[point.item] = self.Route(route[0], point)
        except KeyError:
            # First node for this item; start a new route.
            self._routes[point.item] = self.Route(point, point)

    def items(self):
        """
        Generate one 2-tuple for each item represented in the tree. The first
        element of the tuple is the item itself, and the second element is a
        generator that will yield the nodes in the tree that belong to the item.
        """
        for item in self._routes.keys():
            yield (item, self.nodes(item))

    def nodes(self, item):
        """
        Generate the sequence of nodes that contain the given item.
        """
        try:
            node = self._routes[item][0]
        except KeyError:
            return

        while node:
            yield node
            node = node.neighbor

    def prefix_paths(self, item):
        """Generate the prefix paths that end with the given item."""

        def collect_path(node):
            path = []
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        print('Tree:')
        self.root.inspect(1)

        print()
        print('Routes:')
        for item, nodes in self.items():
            print('  %r' % item)
            for node in nodes:
                print('    %r' % node)

def conditional_tree_from_paths(paths):
    """Build a conditional FP-tree from the given prefix paths."""
    tree = FPTree()
    condition_item = None
    items = set()

    # Import the nodes in the paths into the new tree. Only the counts of the
    # leaf nodes matter; the remaining counts will be reconstructed from the
    # leaf counts.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                # Add a new node to the tree.
                items.add(node.item)
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    assert condition_item is not None

    # Calculate the counts of the non-leaf nodes.
    for path in tree.prefix_paths(condition_item):
        count = path[-1].count
        for node in reversed(path[:-1]):
            node._count += count

    return tree

class FPNode(object):
    """A node in an FP tree."""

    def __init__(self, tree, item, count=1):
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None
        self._children = {}
        self._neighbor = None

    def add(self, child):
        """Add the given FPNode `child` as a child of this node."""
        if not isinstance(child, FPNode):
            raise TypeError("Can only add other FPNodes as children")

        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """
        Check whether this node contains a child node for the given item.
        If so, that node is returned; otherwise, `None` is returned.
        """
        try:
            return self._children[item]
        except KeyError:
            return None

    def __contains__(self, item):
        return item in self._children

    @property
    def tree(self):
        """The tree in which this node appears."""
        return self._tree

    @property
    def item(self):
        """The item contained in this node."""
        return self._item

    @property
    def count(self):
        """The count associated with this node's item."""
        return self._count

    def increment(self):
        """Increment the count associated with this node's item."""
        if self._count is None:
            raise ValueError("Root nodes have no associated count.")
        self._count += 1

    @property
    def root(self):
        """True if this node is the root of a tree; false if otherwise."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """True if this node is a leaf in the tree; false if otherwise."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The node's parent."""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a parent.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a parent from another tree.")
        self._parent = value

    @property
    def neighbor(self):
        """
        The node's neighbor; the one with the same value that is "to the right"
        of it in the tree.
        """
        return self._neighbor

    @neighbor.setter
    def neighbor(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a neighbor.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a neighbor from another tree.")
        self._neighbor = value

    @property
    def children(self):
        """The nodes that are children of this node."""
        return tuple(self._children.values())  # Python 3: values(), not itervalues()

    def inspect(self, depth=0):
        print(('  ' * depth) + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

    def __repr__(self):
        if self.root:
            return "<%s (root)>" % type(self).__name__
        return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)


if __name__ == '__main__':
    from optparse import OptionParser
    import csv

    p = OptionParser(usage='%prog data_file')
    p.add_option('-s', '--minimum-support', dest='minsup', type='int',
                 help='Minimum itemset support (default: 2)')
    p.add_option('-n', '--numeric', dest='numeric', action='store_true',
                 help='Convert the values in datasets to numerals (default: false)')
    p.set_defaults(minsup=2)
    p.set_defaults(numeric=False)

    options, args = p.parse_args()
    if len(args) < 1:
        p.error('must provide the path to a CSV file to read')

    transactions = []
    with open(args[0]) as database:
        for row in csv.reader(database):
            if options.numeric:
                transaction = []
                for item in row:
                    transaction.append(int(item))  # Python 3: long() no longer exists
                transactions.append(transaction)
            else:
                transactions.append(row)

    result = []
    for itemset, support in find_frequent_itemsets(transactions, options.minsup, True):
        result.append((itemset, support))

    result = sorted(result, key=lambda i: i[0])
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))
The two algorithms above come from the internet and serve as the base algorithms for this data-mining exercise.
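For instance, the generator can also be driven directly from in-memory transactions instead of the CSV entry point at the bottom of the file; the transactions below are made up:

# Hypothetical transactions; with minimum_support=2 an itemset must occur
# in at least two of them.
transactions = [['a', 'b'], ['b', 'c', 'd'], ['a', 'b', 'd'], ['b', 'd']]
for itemset, support in find_frequent_itemsets(transactions, 2, include_support=True):
    print(itemset, support)
# Expected output includes ['b'] 4, ['d'] 3, ['b', 'd'] 3, ['a'] 2, ['a', 'b'] 2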

3. The data_analysis file performs some basic analysis of the data: it regularizes categories that are not on the same level and converts values from which the page is hard to identify.

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal WeChat official account: livandata

import re

def open_big_data(path):
    """Read a large file lazily, one line at a time."""
    with open(path) as f:
        for i in f:
            yield i

def data_check(sess_data):
    # 'pingan_pro' is the lookup file of canonical page names (see section 5).
    with open('pingan_pro', 'r') as f:
        data_c = f.read()
    check_data = data_c.split(', ')
    for i in range(len(sess_data)):
        for j in range(len(sess_data[i])):
            # Collapse pages that belong to the same feature into one label.
            if (sess_data[i][j] == '今日步数' or sess_data[i][j] == '免费领月卡'
                    or sess_data[i][j] == '健康服务'):
                sess_data[i][j] = '我的健康'
            if (sess_data[i][j] == '购房贷' or sess_data[i][j] == '买家私'
                    or sess_data[i][j] == '装修超预算'):
                sess_data[i][j] = '房屋贷款'

            if re.search('消息中心', sess_data[i][j]) is not None:
                sess_data[i][j] = '消息中心'

            if (re.search('信用卡', sess_data[i][j]) is not None
                    or re.search('信用额度', sess_data[i][j]) is not None
                    or re.search('临额调整', sess_data[i][j]) is not None
                    or re.search('我的额度', sess_data[i][j]) is not None
                    or re.search('额度评估', sess_data[i][j]) is not None
                    or re.search('还款', sess_data[i][j]) is not None):
                sess_data[i][j] = '信用卡'

            if (re.search('二维码', sess_data[i][j]) is not None
                    or re.search('支付记录', sess_data[i][j]) is not None):
                sess_data[i][j] = '收付款'

            if re.search('通讯录', sess_data[i][j]) is not None:
                sess_data[i][j] = '通讯录'

            # Pages that carry no useful information are marked 'nan'.
            if (re.search('http:', sess_data[i][j]) is not None
                    or re.search('结束页', sess_data[i][j]) is not None
                    or re.search('首页', sess_data[i][j]) is not None
                    or re.search('购买', sess_data[i][j]) is not None
                    or re.search('申请记录', sess_data[i][j]) is not None
                    or re.search('交易详情页', sess_data[i][j]) is not None):
                sess_data[i][j] = 'nan'

            # Map any page that matches an entry of the lookup file to it.
            for t in range(len(check_data)):
                if re.search(check_data[t], sess_data[i][j]) is not None:
                    sess_data[i][j] = check_data[t]

    # Deduplicate pages within a session and drop the 'nan' placeholder.
    for i in range(len(sess_data)):
        page_tmp = list(set(sess_data[i]))
        sess_data[i] = [p for p in page_tmp if p != 'nan']

    return sess_data
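A sketch of how data_check would be driven, assuming the 'pingan_pro' lookup file from section 5 sits in the working directory and using made-up sessions:

# Hypothetical sessions before cleaning; data_check reads 'pingan_pro'
# (section 5) from the working directory.
sessions = [['今日步数', '还款', '首页'],
            ['免费领月卡', '信用额度']]
cleaned = data_check(sessions)
print(cleaned)
# The health pages collapse to '我的健康', the credit pages to '信用卡',
# and '首页' is dropped via the 'nan' placeholder.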
4. The refresh_data file stores the analysis results; distributed processing is emulated through these file reads and writes:

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal WeChat official account: livandata

import os

def write_result(items):
    # Append one "itemset:support" entry per line. The original wrote a bare
    # space separator, which breaks the line-based merge in read_result.
    with open('data_result.txt', 'a+') as f:
        for it in items:
            f.write(str(it) + ':' + str(items[it]) + ' \n')

def read_result(items):
    """Merge the new counts in `items` with the counts already on disk.

    Returns a mapping from old lines to their updated replacement lines,
    plus the entries in `items` that are not yet in the file."""
    data_res = {}
    data_res_2 = {}
    with open('data_result.txt', 'r+') as f:
        for data_tmp in f:
            datas_tmp = data_tmp.split(' ')
            datas = datas_tmp[0].split(':')
            for it in items:
                if datas[0] == it:
                    # Accumulate the support counts across batches.
                    datas_val = str(int(datas[1]) + int(items[it]))
                    data_res[it] = datas_val
                    # The original referenced an undefined name `dat` here;
                    # the replacement line is rebuilt in the write format.
                    data_res_2[data_tmp] = it + ':' + datas_val + ' \n'

    # Entries present in `items` but not yet on disk.
    datass_res_list = [i for i in data_res]
    items_list = [j for j in items]
    res = list(set(items_list).difference(set(datass_res_list)))
    datass_ = {}
    for i in res:
        if i in list(items.keys()):
            datass_[i] = items[i]
    return data_res_2, datass_

def refresh_data(items):
    if os.path.exists('data_result.txt'):
        datas_res, datas_new = read_result(items)
        print(datas_res)
        # Append the entries that were not in the file yet.
        with open('data_result.txt', 'a+') as f:
            for i in datas_new:
                f.write(i + ':' + str(datas_new[i]) + ' \n')
        # Rewrite the file, substituting the updated lines, then swap files.
        datas_res_li = [i for i in datas_res]
        with open('data_result.txt', 'r+') as f:
            for j in f:
                if j not in datas_res_li:
                    with open('data_result2.txt', 'a+') as f2:
                        f2.write(j)
                else:
                    with open('data_result2.txt', 'a+') as f2:
                        f2.write(datas_res[j])
        os.remove('data_result.txt')
        os.rename('data_result2.txt', 'data_result.txt')
    else:
        write_result(items)
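The merge semantics can be checked with two small batches (the itemset keys below are made-up examples):

# First batch creates data_result.txt; the second batch both updates an
# existing entry and appends a new one. The keys are made-up examples.
refresh_data({"['信用卡']": '3', "['我的健康']": '2'})
refresh_data({"['信用卡']": '1', "['收付款']": '4'})
# data_result.txt now holds ['信用卡']:4, ['我的健康']:2 and ['收付款']:4.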
5. The pro file: the lookup library of page names used for the conversions (read as 'pingan_pro' by data_check):

    96搜索,
    借钱,
    口袋社区,
    领券中心,
    猜金价,
    种摇钱树,
    车主贷,
    宅易通,
6. The run file, which drives the whole pipeline:

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal WeChat official account: livandata

import pandas as pd
import data_analysis as das
import Fp_growth as fpg
import refresh_data as rfd

# NOTE: the path separators were lost in the original post; adjust this to
# the actual location of the session data.
path = '..datasub_customer.csv'
loop = True
chunkSize = 10
chunks = []
reader = pd.read_csv(path, iterator=True, dtype=str)
while loop:
    try:
        chunk = reader.get_chunk(chunkSize).fillna('nan')
        # The original line was corrupted by an injected URL; a plain
        # reset_index() keeps 'page_name' addressable as a column.
        data = chunk[chunk['page_name'] != 'nan']['page_name'].reset_index()
        page_names = []
        for i in range(len(data['page_name'])):
            names = data['page_name'][i].split('"')
            page_name = [j for j in names if (j != '[' and j != ']' and j != ',')]
            page_names.append(page_name)
        # Normalize all sessions in the chunk (the original passed the single
        # session `page_name` here, which looks like a typo).
        page_names = das.data_check(page_names)
        page_names = [i for i in page_names if i != []]

        frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                       include_support=True)
        result = []
        for itemset, support in frequent_itemsets:
            result.append((itemset, support))
        items = {}
        n = 5
        minSup = 0.6
        for itemset, support in result:
            keys = str(itemset)
            values = str(support)
            if float(values) / n >= minSup:
                items[keys] = values

        rfd.refresh_data(items)
    except StopIteration:
        loop = False
        print('Iteration is stopped')
That is the complete run of the algorithm. It incorporates pseudo-distributed processing, mainly borrowing Hadoop's way of working.
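Conceptually, each batch plays the role of a map task (local itemset counts) and refresh_data plays the role of the reduce step (merging counts on disk). A sketch of that reduction, not part of the original code:

from collections import Counter

# "Map": each batch produces local itemset counts;
# "Reduce": the global counts are merged, as refresh_data does via files.
def reduce_counts(batches):
    total = Counter()
    for local_counts in batches:   # one Counter per chunk
        total.update(local_counts)
    return total

print(reduce_counts([Counter({'a': 2}), Counter({'a': 1, 'b': 3})]))
# Counter({'b': 3, 'a': 3})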

• Original post: https://www.cnblogs.com/hyhy904/p/11075627.html