|
Example:
|
|
Example:
|
Specifically:
|
Example:
|
|
|
|
Example: if {b,c,d} is frequent, then all subsets of {b,c,d} are also frequent
|
|
Example: if {a,b} is infrequent, then all its supersets ({a,b,c}, {a,b,d}, and {a,b,c,d}) are also infrequent:
|
|
This technique is known as support-based pruning
It was first described in the paper: click here
It is available on Oracle: click here
|
|
|
|
k = 1;
F(1) = { i | freq(i) ≥ θN }; // 1-item sets
repeat
{
k = k + 1;
/* ----------------------------------
Candidate set generation
---------------------------------- */
C(k) = Apriori-Gen( F(k-1) ); // Generate candidate itemsets
// using only itemsets in F(k-1)
/* --------------------------------------------------
Compute support for candidate sets
-------------------------------------------------- */
for ( each candidate set c ∈ C(k) ) do
Freq(c) = 0;
for ( each transaction t ∈ T ) do
{
for ( each candidate set c ∈ C(k) ) do
{
if ( c ⊆ t )
Freq(c)++;
}
}
/* ---------------------------------------
Prune Candidate set C(k)
--------------------------------------- */
F(k) = { c | c ∈ C(k) ∧ Freq(c) ≥ θN }
}
until F(k) == ∅
Frequent itemsets = F(1) ∪ F(2) ∪ ... ∪ F(k-1)
|
|
(There are actually 2 phases: the prune phase is very simple and is omitted from the discussion.)
|
|
Advantage: simple code
Disadvantage: large number of item sets generated (largest possible number)
C(k) = ∅;
for ( each S ∈ F(k-1) ) do
for ( each T ∈ F(1) ) do
add S ∪ T to C(k);
|
Properties:
|
C(k) = ∅;
for ( each S ∈ F(k-1) ) do
for ( each T ∈ F(k-1) ) do
if ( | S ∪ T | == k )
add S ∪ T to C(k);
|
Method 1: naive comparison
/* ----------------------
Initialize counts
---------------------- */
for ( each item set S ∈ C(k) ) do
{
Freq(S) = 0;
}
/* ----------------------------------
Count
---------------------------------- */
for ( each transaction t ) do
{
for ( each k-item set S ∈ C(k) ) do
{
if ( S ⊆ t )
{
Freq(S)++;
}
}
}
|
Transactions: t1 t2 ... t ....
|
+----------------------+
|
Candidate sets C(k): C1 C2 C3 ... Cn |
^ ^ ^ ^ |
| | | | |
+---+---+--------+-----------+ Ci ⊆ t ?
traverse all sets in C(k)
|
Properties:
|
Method 2: lookup counting
/* -------------------------------
Initialize counters
------------------------------- */
for ( each item set S ∈ C(k) ) do
{
Freq(S) = 0;
}
for ( each transaction t ) do
{
for ( each k-subset T of t ) do
{
Lookup T in C(k);
if ( found )
{
Freq(T)++;
}
}
}
|
Transactions: t1 t2 ... t ....
|
+---------------------------+
| k-item sets
+--+--+--+--+--+--+
| | | | | | |
v v v v v v v
T1 T2 T3 T4 T5 T6 T7
|
Candidate sets C(k): C1 C2 C3 ... Cn |
^ |
| |
+--------------------------+
"lookup"
|
There are 2 unspecified parts in the algorithm:
|
|
Sample implementation:
/* -----------------------------------------------
   gen(head, a, k): print every k-character
                    extension of a prefix

     head = prefix of the string (already chosen)
     a    = remaining characters to choose from;
            characters are picked in the order
            they appear in a, so each combination
            is printed exactly once
     k    = number of characters still to add

   Each result is printed as ">> <string>\n".

   Limits:
     - k < 0 prints nothing.
     - If fewer than k characters remain in a,
       nothing is printed.
     - The working copy of the prefix lives in a
       10-byte buffer, so strlen(head) + k must be
       at most 9.  Larger requests are reported on
       stderr and skipped instead of overflowing
       the buffer.
   ----------------------------------------------- */
void gen(char *head, char *a, int k)
{
    char myHead[10];
    size_t headLen;
    char *c, *e;

    /* ------------------------------------
       Nothing left to add: print the result
       ------------------------------------ */
    if ( k == 0 )
    {
        printf(">> %s\n", head);
        return;                     // Done
    }

    /* ------------------------------------------------
       A negative count can never reach the k == 0 case
       ------------------------------------------------ */
    if ( k < 0 )
        return;

    /* ------------------------------------------------
       Not enough characters left to complete a string
       ------------------------------------------------ */
    if ( strlen(a) < (size_t) k )
        return;

    /* ----------------------------------------------------
       The deepest recursion level holds strlen(head) + k
       characters plus the terminating '\0'; refuse requests
       that do not fit in myHead
       ---------------------------------------------------- */
    headLen = strlen(head);
    if ( headLen >= sizeof(myHead) ||
         (size_t) k >= sizeof(myHead) - headLen )
    {
        fprintf(stderr,
                "gen: prefix \"%s\" plus %d more characters exceeds the %d-character limit\n",
                head, k, (int) sizeof(myHead) - 1);
        return;
    }

    /* ----------------------------------------------------
       Copy prefix into local variable to enable recursion
       (each level extends its own private copy)
       ---------------------------------------------------- */
    memcpy(myHead, head, headLen);
    e = myHead + headLen;           // Slot for the character added at this level
    e[1] = '\0';                    // Terminate the extended prefix

    /* -----------------------------------------
       Add one character to the prefix string
       ----------------------------------------- */
    for ( c = a; *c != '\0'; c++ )
    {
        *e = *c;                    // Add next character to prefix
        gen(myHead, c+1, k-1);      // Add remaining characters
    }
}
|
/* -------------------------------
Initialize counters
------------------------------- */
for ( each item set S ∈ C(k) ) do
{
Freq(S) = 0;
}
for ( each transaction t ) do
{
for ( each k-subset T of t ) do
{
Lookup T in C(k); *********
if ( found )
{
Freq(T)++;
}
}
}
|
How can we speed up the lookup process that finds the counter for a candidate set in C(k)?
|
|
Organization:
|
|