In [ ]:
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')

# === 2.1 Association rules from frequent itemsets ===

url = "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/online_retail.csv"
df = pd.read_csv(url)
df.shape
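In [ ]:
# Optional peek at the raw data (my addition): the columns used below are
# InvoiceNo and Description.
df.head()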
In [ ]:
# Remove rows where InvoiceNo starts with 'C' (cancelled purchases)
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
df.shape
In [ ]:
### OPTION 1 ###
invoices = {} # map each InvoiceNo to the set of its item descriptions

for _, row in df.iterrows(): 
    invoice_no = row['InvoiceNo']
    description = row['Description']

    if invoice_no not in invoices: 
        invoices[invoice_no] = set()

    invoices[invoice_no].add(description)

print(len(invoices))
In [ ]:
### OPTION 2 ###
from collections import defaultdict

# Create a defaultdict where each key maps to an empty set by default
invoices = defaultdict(set)

# Loop over the DataFrame rows
for _, row in df.iterrows():
    invoices[row['InvoiceNo']].add(row['Description'])

print(len(invoices))
In [ ]:
### OPTION 3 ###
invoices = (
    df.groupby('InvoiceNo')['Description']
      .apply(lambda x: set(x.dropna()))  # dropna() discards missing Descriptions
      .to_dict()
)
# Note: because of dropna(), item counts can differ slightly from the two
# loop-based options above if the CSV contains missing Descriptions.

print(len(invoices))
In [ ]:
# Peek at a few invoices (displaying the whole dict would flood the output)
dict(list(invoices.items())[:3])
In [ ]:
# Collection of unique items

all_items_set = set() # duplicates are discarded
for items in invoices.values(): # iterate over the invoice item sets
    all_items_set.update(items) # add the items to the set

all_items = sorted(all_items_set) # convert to a sorted list (fixed column order)
In [ ]:
len(all_items)
In [ ]:
## OPTION 1 EXTENDED ##
import numpy as np

# Step 1: collect each invoice's item set in a list (fixes the row order)
invoice_sets = list(invoices.values())

# Step 2: create an empty list to hold each invoice's row
presence_rows = []

# Step 3: loop through each invoice
for invoice in invoice_sets:
    # for each invoice, create a list to hold the 0/1 indicators
    row = []

    # Step 4: loop through each item in all_items
    for item in all_items:
        # if the item is in the current invoice, append 1, else 0
        if item in invoice:
            row.append(1)
        else:
            row.append(0)
    
    # Step 5: after building the row, append it to the list of rows
    presence_rows.append(row)

# Step 6: convert the list of lists into a NumPy array
presence_matrix = np.array(presence_rows, dtype=int)

# Step 7: display the matrix
print("Presence matrix:\n", presence_matrix)
In [ ]:
## OPTION 1 ## 
presence_matrix = np.array(
    [[int(item in invoice) for item in all_items] for invoice in invoice_sets],
    dtype=int
)

presence_matrix
In [ ]:
### OPTION 2 EXTENDED ###
# (Step 1, building `invoices` and `all_items`, was done in earlier cells.
# The verbose prints below are for illustration on small examples; on the
# full dataset they produce a huge amount of output.)

# Step 2: map each item to its column index
item_index = {}
for i, item in enumerate(all_items):
    item_index[item] = i
print("Item index mapping:", item_index)

# Step 3: initialize an empty matrix of zeros
n_invoices = len(invoices)
n_items = len(all_items)
presence_matrix = np.zeros((n_invoices, n_items), dtype=int)
print("\nInitial matrix of zeros:\n", presence_matrix)

# Step 4: fill in the matrix
for i, (invoice_name, invoice_items) in enumerate(invoices.items()):
    print(f"\nProcessing {invoice_name} with items:", invoice_items)
    
    # loop through all items in this invoice
    for item in invoice_items:
        if item in item_index:
            j = item_index[item]  # column index for this item
            presence_matrix[i, j] = 1  # mark presence as 1
            print(f"  -> Marking presence_matrix[{i}, {j}] = 1 for item '{item}'")

# Step 5: display the completed matrix
print("\nFinal presence matrix:\n", presence_matrix)
In [ ]:
### OPTION 2 ###
# Map each item to its column index
item_index = {item: i for i, item in enumerate(all_items)} # which column corresponds to each product 

# Initialize an empty 0/1 matrix
presence_matrix = np.zeros((len(invoices), len(all_items)), dtype=int) # create a numpy array of zeros with N(invoices) rows and M(products) columns

# Fill it in
for i, invoice in enumerate(invoices.values()): # iterate over each invoice's item set, tracking the row index i
    cols = [item_index[item] for item in invoice if item in item_index]
    presence_matrix[i, cols] = 1

presence_matrix
In [ ]:
df = pd.DataFrame(data=presence_matrix, columns=all_items)
df
In [ ]:
import warnings
warnings.filterwarnings('ignore')

from mlxtend.frequent_patterns import fpgrowth

list_of_minsup = [0.5, 0.1, 0.05, 0.02, 0.01]

for msp in list_of_minsup: 
    fi = fpgrowth(df, msp)
    print(f"{msp} -> Number of elements:", len(fi))
In [ ]:
fpgrowth(df, 0.02)
In [ ]:
freq_itemsets = fpgrowth(df, 0.02)
freq_itemsets[freq_itemsets["itemsets"].map(len) > 1]
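In [ ]:
# Since fpgrowth was called without use_colnames, each itemset holds column
# positions rather than product names. A quick sketch (my addition) mapping
# one multi-item itemset back to the product descriptions:
pairs = freq_itemsets[freq_itemsets["itemsets"].map(len) > 1]
print([df.columns[i] for i in pairs.iloc[0]["itemsets"]])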
In [ ]:
M = df.values # presence matrix as a NumPy array

col_a, col_b = 1857, 1855 # column indices of the two items to compare

support_a = M[:, col_a].mean() # entries are 0/1, so the mean is directly the support
support_b = M[:, col_b].mean()
support_both = np.mean((M[:, col_a] == 1) & (M[:, col_b] == 1)) # joint support

confidence_a_to_b = support_both / support_a
confidence_b_to_a = support_both / support_b

print(f"Confidence {col_a} => {col_b}: {confidence_a_to_b:.3f}")
print(f"Confidence {col_b} => {col_a}: {confidence_b_to_a:.3f}")
In [ ]:
from mlxtend.frequent_patterns import association_rules

# 1) Ensure your input df to fpgrowth is one-hot boolean (no ints)
#    If it's 0/1 ints, convert to bool:
df_bool = df.astype(bool)

# 2) Mine frequent itemsets (use column names in the itemsets)
fi = fpgrowth(df_bool, min_support=0.01, use_colnames=True)

# 3) Generate rules, filtering by confidence >= 0.85
rules = association_rules(fi, metric='confidence', min_threshold=0.85)
rules.head()
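In [ ]:
# Optional inspection (my addition): sort the mined rules by lift to surface
# the strongest associations first. association_rules reports lift alongside
# support and confidence.
rules.sort_values('lift', ascending=False).head(10)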
In [ ]:
# === 2.2 Apriori implementation ===

transactions = [
    ["a","b"],
    ["b","c","d"],
    ["a","c","d","e"],
    ["a","d","e"],
    ["a","b","c"],
    ["a","b","c","d"],
    ["b","c"],
    ["a","b","c"],
    ["a","b","d"],
    ["b","c","e"]
]
In [ ]:
from collections import defaultdict # assigns a default value to keys that do not exist yet

freq = defaultdict(lambda: 0)
minsup = 1
for t in transactions:
    for el in t:
        freq[el] += 1
dict(freq)
In [ ]:
### CREATION OF L EXTENDED ###
L = []

# Step 1: Create an empty dictionary to store the frequent itemsets of this level
new_level = {}

# Step 2: Loop through all itemsets and their frequencies in 'freq'
for k, v in freq.items():
    print(f"Checking itemset {k} with frequency {v}")

    # Step 3: Keep only those itemsets whose frequency meets or exceeds the minimum support
    if v >= minsup:
        new_level[(k,)] = v  # store as a tuple key (e.g., ('apple',)) to keep consistent structure
        print(f"  -> Itemset {k} is frequent (v >= minsup). Added to new level.")

    else:
        print(f"  -> Itemset {k} is infrequent (v < minsup). Discarded.")

# Step 4: Append this new dictionary of frequent itemsets to the main list L
L.append(new_level)

# Step 5: Print confirmation
print("\nNew level of frequent itemsets added to L:")
for itemset, count in new_level.items():
    print(f"  {itemset}: {count}")
In [ ]:
def generate_candidates(L):
    """
    Generate candidate itemsets (C_{k+1}) from a list of frequent itemsets (L_k).

    Each element of L is assumed to be a tuple of items (sorted).
    The function performs a self-join: it combines pairs of itemsets
    that share the same prefix (all but the last item) to form larger candidates.
    """

    # Step 1: initialize the list that will store new candidate itemsets
    C = []

    print("Step 1 — Starting candidate generation")
    print("Input frequent itemsets (L):")
    for itemset in L:
        print(" ", itemset)

    # Step 2: iterate through all unique pairs of itemsets in L
    print("\nStep 2 — Comparing pairs of itemsets:")
    for i in range(len(L)):
        for j in range(i + 1, len(L)):
            print(f"\nComparing L[{i}] = {L[i]} and L[{j}] = {L[j]}")

            # Step 3: check if their prefixes (all items except the last one) are identical
            if L[i][:-1] == L[j][:-1]:
                print("  -> Prefixes match:", L[i][:-1])

                # Step 4: combine them into a new candidate
                # We keep the shared prefix and append the two different last items, sorted
                new_candidate = L[i][:-1] + tuple(sorted([L[i][-1], L[j][-1]]))

                # Step 5: add this candidate to the list
                C.append(new_candidate)
                print("  -> Generated new candidate:", new_candidate)
            else:
                print("  -> Prefixes differ, no candidate generated.")

    # Step 6: print and return the complete candidate list
    print("\nStep 6 — Final candidate list:")
    for candidate in C:
        print(" ", candidate)

    return C
In [ ]:
def generate_candidates(L):
    C = []
    for i in range(len(L)):
        for j in range(i+1, len(L)):
            # iterate over all possible pairs in L
            if L[i][:-1] == L[j][:-1]:
                # L[i] and L[j] have matching prefixes (they only differ by the last item),
                # so we generate the new element by using the same prefix and appending the two suffixes
                C.append(L[i][:-1] + tuple(sorted([L[i][-1], L[j][-1]])))
    return C
In [ ]:
# expecting ('a','b','c') from ab+ac, and ('c','d','f'), ('c','e','f'), ('c','d','e') from cf/cd/ce; ('b','c') has no join partner
print(generate_candidates([ ('a','b'), ('a','c'), ('b','c'), ('c','f'), ('c','d'), ('c','e')]))
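In [ ]:
# A second sanity check (my addition): two triples sharing the 2-item prefix
# ('a','b') join into a single 4-item candidate.
assert generate_candidates([('a','b','c'), ('a','b','d')]) == [('a','b','c','d')]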
In [ ]:
from itertools import combinations

def prune_values(freq, candidates):
    # Apriori pruning: a candidate of size k can be frequent only if all of
    # its subsets of size k-1 are frequent, i.e. appear as keys of freq
    keep = []
    for candidate in candidates:
        combs = list(combinations(candidate, len(candidate)-1))
        if all([ comb in freq for comb in combs ]):
            keep.append(candidate)
    return keep
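In [ ]:
# Illustration of the pruning step (my addition, on toy data): ('a','b','c')
# is dropped because its subset ('b','c') is not among the frequent itemsets.
toy_freq = {('a','b'): 3, ('a','c'): 2}
print(prune_values(toy_freq, [('a','b','c')]))  # -> []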
In [ ]:
from collections import defaultdict

def my_apriori(transactions, minsup):
    # step 1 - count the frequency of each single item
    freq = defaultdict(lambda: 0)
    for t in transactions:
        for el in t:
            freq[el] += 1
    L = []
    L.append({ (k,): v for k,v in freq.items() if v >= minsup })
    while L[-1] != {}: # L[-1] is the last level computed
        # step 2 - generate candidates from the last level, then prune them
        C = prune_values(L[-1], generate_candidates(list(L[-1].keys())))
        # step 3 - count the support of each surviving candidate
        freq = defaultdict(lambda: 0)
        for t in transactions:
            transaction_set = set(t)
            for c in C:
                if set(c).issubset(transaction_set):
                    freq[c] += 1
        # step 4 - keep only the candidates that reach minsup
        L.append({ k: v for k,v in freq.items() if v >= minsup })
    return L[:-1] # do not return the last element, as we know it is {}
In [ ]:
my_apriori(transactions, 2)
In [ ]:
import requests

# 1. Load JSON from URL
url = "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/modified_coco.json"
response = requests.get(url)
images = response.json()   # parse the JSON content

# 2. Build dataset: list of unique annotations per image
dataset = [list(set(image["annotations"])) for image in images]

# 3. Check how many images you have
print(f"Number of images: {len(dataset)}")
In [ ]:
L = my_apriori(dataset, 0.02 * len(dataset))

for k, L_k in enumerate(L[1:]): # skip level 1 (the single items)
    print(k+2, list(L_k.keys())[:10])
In [ ]:
import pandas as pd 

all_items_set = set()
for items in dataset:
    all_items_set.update(items)
all_items = sorted(list(all_items_set))
presence_matrix = [ [ int(item in image) for item in all_items ] for image in dataset ]
df = pd.DataFrame(presence_matrix, columns=all_items)
In [ ]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth

fi_ap = apriori(df, 0.02)
fi_fp = fpgrowth(df, 0.02)
fi_my_ap = my_apriori(dataset, 0.02 * len(dataset)) # len(dataset) is 5000 here
In [ ]:
tuples_ap = { tuple(row) for row in fi_ap.values }
tuples_fp = { tuple(row) for row in fi_fp.values}
tuples_ap == tuples_fp
In [ ]:
fi_myap = set()
for L_k in L:
    # - v/len(dataset) converts the support from absolute to relative
    # - frozenset() builds an immutable set
    # - { all_items.index(k_) for k_ in k } builds a set with the indices
    #   (in the list all_items) of the elements in k, matching the
    #   column-index encoding used by mlxtend
    fi_myap.update({ (v/len(dataset), frozenset({all_items.index(k_) for k_ in k})) for k,v in L_k.items() })
In [ ]:
tuples_ap == fi_myap
In [ ]:
import warnings
warnings.filterwarnings('ignore')

from timeit import timeit
print("fpgrowth", timeit(lambda: fpgrowth(df, 0.005), number=1))
print("apriori", timeit(lambda: apriori(df, 0.005), number=1))
print("my_apriori", timeit(lambda: my_apriori(dataset, 0.005 * 5000), number=1))