In [ ]:
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')
# === 2.1 Association rules from frequent itemsets ===
url = "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/online_retail.csv"
df = pd.read_csv(url)
df.shape
In [ ]:
# Remove rows where InvoiceNo starts with 'C' (cancelled purchases)
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
df.shape
In [ ]:
### OPTION 1 ###
invoices = {}
for _, row in df.iterrows():
    invoice_no = row['InvoiceNo']
    description = row['Description']
    if invoice_no not in invoices:
        invoices[invoice_no] = set()
    invoices[invoice_no].add(description)
print(len(invoices))
In [ ]:
from collections import defaultdict
# Create a defaultdict where each key maps to an empty set by default
invoices = defaultdict(set)
# Loop over the DataFrame rows
for _, row in df.iterrows():
    invoices[row['InvoiceNo']].add(row['Description'])
print(len(invoices))
In [ ]:
invoices = (
    df.groupby('InvoiceNo')['Description'].apply(lambda x: set(x.dropna()))
    .to_dict()
)
print(len(invoices))
In [ ]:
invoices
In [ ]:
# Collection of unique items
all_items_set = set()  # duplicates are discarded
for items in invoices.values():  # iterate over the dictionary values
    all_items_set.update(items)  # insert the elements into the set
len(all_items_set)
all_items = sorted(all_items_set)  # convert to a sorted list
In [ ]:
len(all_items)
In [ ]:
### OPTION 1 EXTENDED ###
invoice_sets = list(invoices.values())
# Step 2: create an empty list to hold each invoice's row
presence_rows = []
# Step 3: loop through each invoice
for invoice in invoice_sets:
    # for each invoice, create a list to hold the 0/1 indicators
    row = []
    # Step 4: loop through each item in all_items
    for item in all_items:
        # if the item is in the current invoice, append 1, else 0
        if item in invoice:
            row.append(1)
        else:
            row.append(0)
    # Step 5: after building the row, append it to the list of rows
    presence_rows.append(row)
# Step 6: convert the list of lists into a NumPy array
presence_matrix = np.array(presence_rows, dtype=int)
# Step 7: display the matrix
print("Presence matrix:\n", presence_matrix)
In [ ]:
### OPTION 1 ###
presence_matrix = np.array(
    [[int(item in invoice) for item in all_items] for invoice in invoice_sets],
    dtype=int
)
presence_matrix
In [ ]:
### OPTION 2 EXTENDED ###
# Step 2: map each item to its column index
item_index = {}
for i, item in enumerate(all_items):
    item_index[item] = i
print("Item index mapping:", item_index)
# Step 3: initialize an empty matrix of zeros
n_invoices = len(invoices)
n_items = len(all_items)
presence_matrix = np.zeros((n_invoices, n_items), dtype=int)
print("\nInitial matrix of zeros:\n", presence_matrix)
# Step 4: fill in the matrix
for i, (invoice_name, invoice_items) in enumerate(invoices.items()):
    print(f"\nProcessing {invoice_name} with items:", invoice_items)
    # loop through all items in this invoice
    for item in invoice_items:
        if item in item_index:
            j = item_index[item]  # column index for this item
            presence_matrix[i, j] = 1  # mark presence as 1
            print(f" -> Marking presence_matrix[{i}, {j}] = 1 for item '{item}'")
# Step 5: display the completed matrix
print("\nFinal presence matrix:\n", presence_matrix)
In [ ]:
### OPTION 2 ###
# Map each item to its column index
item_index = {item: i for i, item in enumerate(all_items)}  # which column corresponds to each product
# Initialize an empty 0/1 matrix with N(invoices) rows and M(products) columns
presence_matrix = np.zeros((len(invoices), len(all_items)), dtype=int)
# Fill it in
for i, invoice in enumerate(invoices.values()):  # iterate over each invoice's set of items, keeping track of the invoice's row index i
    cols = [item_index[item] for item in invoice if item in item_index]
    presence_matrix[i, cols] = 1
presence_matrix
In [ ]:
df = pd.DataFrame(data=presence_matrix, columns=all_items)
df
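In [ ]:
# Optional sanity check (a quick sketch): every row of the one-hot DataFrame
# should sum to the number of distinct items in the corresponding invoice.
# This assumes the matrix rows follow the iteration order of invoices.values(),
# as in the cells above.
row_sums = df.sum(axis=1).values
invoice_sizes = np.array([len(items) for items in invoices.values()])
print("All row sums match invoice sizes:", (row_sums == invoice_sizes).all())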
In [ ]:
from mlxtend.frequent_patterns import fpgrowth
list_of_minsup = [0.5, 0.1, 0.05, 0.02, 0.01]
for msp in list_of_minsup:
    fi = fpgrowth(df, msp)
    print(f"{msp} -> Number of elements:", len(fi))
In [ ]:
fpgrowth(df, 0.02)
In [ ]:
freq_itemsets = fpgrowth(df, 0.02)
freq_itemsets[freq_itemsets["itemsets"].map(len) > 1]
In [ ]:
M = df.values  # the matrix underlying the DataFrame
col_a, col_b = 1857, 1855  # select two item columns by index
support_a = M[:, col_a].mean()  # the mean of a 0/1 column directly gives the fraction of invoices containing the item
support_b = M[:, col_b].mean()
support_both = np.mean((M[:, col_a] == 1) & (M[:, col_b] == 1))
confidence_a_to_b = support_both / support_a
confidence_b_to_a = support_both / support_b
print(f"Confidence {col_a} => {col_b}: {confidence_a_to_b:.3f}")
print(f"Confidence {col_b} => {col_a}: {confidence_b_to_a:.3f}")
In [ ]:
from mlxtend.frequent_patterns import association_rules
# 1) Ensure your input df to fpgrowth is one-hot boolean (no ints)
# If it's 0/1 ints, convert to bool:
df_bool = df.astype(bool)
# 2) Mine frequent itemsets (use column names in the itemsets)
fi = fpgrowth(df_bool, min_support=0.01, use_colnames=True)
# 3) Generate rules, filtering by confidence >= 0.85
rules = association_rules(fi, metric='confidence', min_threshold=0.85)
rules.head()
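In [ ]:
# A possible follow-up (illustrative, not required by the exercise): rank the
# mined rules by lift to surface the most strongly associated itemsets. The
# column names below are the standard association_rules output columns.
rules.sort_values('lift', ascending=False)[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift']
].head(10)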
In [ ]:
# === 2.2 Apriori implementation ===
transactions = [
    ["a","b"],
    ["b","c","d"],
    ["a","c","d","e"],
    ["a","d","e"],
    ["a","b","c"],
    ["a","b","c","d"],
    ["b","c"],
    ["a","b","c"],
    ["a","b","d"],
    ["b","c","e"]
]
In [ ]:
from collections import defaultdict  # missing keys are automatically assigned a default value
freq = defaultdict(lambda: 0)
minsup = 1
for t in transactions:
    for el in t:
        freq[el] += 1
dict(freq)
In [ ]:
### CREATION OF L EXTENDED ###
L = []
# Step 1: Create an empty dictionary to store the frequent itemsets of this level
new_level = {}
# Step 2: Loop through all itemsets and their frequencies in 'freq'
for k, v in freq.items():
    print(f"Checking itemset {k} with frequency {v}")
    # Step 3: Keep only those itemsets whose frequency meets or exceeds the minimum support
    if v >= minsup:
        new_level[(k,)] = v  # store as a tuple key (e.g., ('apple',)) to keep a consistent structure
        print(f" -> Itemset {k} is frequent (v >= minsup). Added to new level.")
    else:
        print(f" -> Itemset {k} is infrequent (v < minsup). Discarded.")
# Step 4: Append this new dictionary of frequent itemsets to the main list L
L.append(new_level)
# Step 5: Print confirmation
print("\nNew level of frequent itemsets added to L:")
for itemset, count in new_level.items():
    print(f" {itemset}: {count}")
In [ ]:
def generate_candidates(L):
    """
    Generate candidate itemsets (C_{k+1}) from a list of frequent itemsets (L_k).
    Each element of L is assumed to be a tuple of items (sorted).
    The function performs a self-join: it combines pairs of itemsets
    that share the same prefix (all but the last item) to form larger candidates.
    """
    # Step 1: initialize the list that will store new candidate itemsets
    C = []
    print("Step 1 — Starting candidate generation")
    print("Input frequent itemsets (L):")
    for itemset in L:
        print(" ", itemset)
    # Step 2: iterate through all unique pairs of itemsets in L
    print("\nStep 2 — Comparing pairs of itemsets:")
    for i in range(len(L)):
        for j in range(i + 1, len(L)):
            print(f"\nComparing L[{i}] = {L[i]} and L[{j}] = {L[j]}")
            # Step 3: check if their prefixes (all items except the last one) are identical
            if L[i][:-1] == L[j][:-1]:
                print(" -> Prefixes match:", L[i][:-1])
                # Step 4: combine them into a new candidate
                # We keep the shared prefix and append the two different last items, sorted
                new_candidate = L[i][:-1] + tuple(sorted([L[i][-1], L[j][-1]]))
                # Step 5: add this candidate to the list
                C.append(new_candidate)
                print(" -> Generated new candidate:", new_candidate)
            else:
                print(" -> Prefixes differ, no candidate generated.")
    # Step 6: print and return the complete candidate list
    print("\nStep 6 — Final candidate list:")
    for candidate in C:
        print(" ", candidate)
    return C
In [ ]:
def generate_candidates(L):
    C = []
    for i in range(len(L)):
        for j in range(i+1, len(L)):
            # iterate over all possible pairs in L
            if L[i][:-1] == L[j][:-1]:
                # L[i] and L[j] have matching prefixes (they only differ by the last item),
                # so we generate the new element by using the same prefix and appending the two suffixes
                C.append(L[i][:-1] + tuple(sorted([L[i][-1], L[j][-1]])))
    return C
In [ ]:
# expecting ('a','b','c'), ('c','d','f'), ('c','e','f'), ('c','d','e')
print(generate_candidates([ ('a','b'), ('a','c'), ('b','c'), ('c','f'), ('c','d'), ('c','e')]))
In [ ]:
from itertools import combinations
def prune_values(freq, candidates):
    # keep only the candidates whose (k-1)-subsets are all frequent
    keep = []
    for candidate in candidates:
        combs = list(combinations(candidate, len(candidate)-1))
        if all(comb in freq for comb in combs):
            keep.append(candidate)
    return keep
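In [ ]:
# Quick illustrative check of prune_values on toy values: ('a','b','c') is kept
# because all of its 2-subsets appear in toy_freq, while ('a','b','d') is pruned
# because ('b','d') is not frequent. toy_freq is a made-up example, not data
# from the exercise.
toy_freq = {('a','b'): 3, ('a','c'): 2, ('b','c'): 2, ('a','d'): 2}
print(prune_values(toy_freq, [('a','b','c'), ('a','b','d')]))  # -> [('a', 'b', 'c')]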
In [ ]:
from collections import defaultdict
def my_apriori(transactions, minsup):
    # step 1 - generation of the frequencies
    freq = defaultdict(lambda: 0)
    for t in transactions:
        for el in t:
            freq[el] += 1
    L = []
    L.append({ (k,): v for k, v in freq.items() if v >= minsup })
    while L[-1] != {}:  # L[-1] is the last element
        C = prune_values(L[-1], generate_candidates(list(L[-1].keys())))  # step 2
        # step 3
        freq = defaultdict(lambda: 0)
        for t in transactions:
            transaction_set = set(t)
            for c in C:
                if set(c).issubset(transaction_set):
                    freq[c] += 1
        L.append({ k: v for k, v in freq.items() if v >= minsup })  # step 4
    return L[:-1]  # do not return the last element, as we know it is {}
In [ ]:
my_apriori(transactions, 2)
In [ ]:
import requests
# 1. Load JSON from URL
url = "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/modified_coco.json"
response = requests.get(url)
images = response.json() # parse the JSON content
# 2. Build dataset: list of unique annotations per image
dataset = [list(set(image["annotations"])) for image in images]
# 3. Check how many images you have
print(f"Number of images: {len(dataset)}")
In [ ]:
L = my_apriori(dataset, 0.02 * len(dataset))
for k, L_k in enumerate(L[1:]):
    print(k+2, list(L_k.keys())[:10])
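In [ ]:
# Illustrative follow-up: show the five 2-itemsets with the highest support,
# converting the absolute counts stored in L[1] to relative supports.
top_pairs = sorted(L[1].items(), key=lambda kv: kv[1], reverse=True)[:5]
for itemset, count in top_pairs:
    print(itemset, f"{count / len(dataset):.3f}")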
In [ ]:
import pandas as pd
all_items_set = set()
for items in dataset:
    all_items_set.update(items)
all_items = sorted(all_items_set)
presence_matrix = [[int(item in image) for item in all_items] for image in dataset]
df = pd.DataFrame(presence_matrix, columns=all_items)
In [ ]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
fi_ap = apriori(df, 0.02)
fi_fp = fpgrowth(df, 0.02)
fi_my_ap = my_apriori(dataset, 0.02 * len(dataset))
In [ ]:
tuples_ap = { tuple(row) for row in fi_ap.values }
tuples_fp = { tuple(row) for row in fi_fp.values }
tuples_ap == tuples_fp
In [ ]:
fi_myap = set()
for L_k in L:
    # - v/5000 converts the support from absolute to relative
    # - frozenset() builds an immutable set
    # - { all_items.index(k_) for k_ in k } builds a set where
    #   the elements are the indices (in the list all_items)
    #   of the elements in k
    fi_myap.update({ (v/5000, frozenset({all_items.index(k_) for k_ in k})) for k, v in L_k.items() })
In [ ]:
tuples_ap == fi_myap
In [ ]:
from timeit import timeit
print("fpgrowth", timeit(lambda: fpgrowth(df, 0.005), number=1))
print("apriori", timeit(lambda: apriori(df, 0.005), number=1))
print("my_apriori", timeit(lambda: my_apriori(dataset, 0.005 * 5000), number=1))