Data Science and Machine Learning Lab - Lab #1: NumPy Introduction¶
1. Datasets Download¶
In [ ]:
# IRIS
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" -O iris.csv
# MNIST (CSV subset)
!wget "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv" -O mnist_test.csv
2. Warm-up with NumPy¶
In [2]:
import numpy as np
# === 2.1 Array creation and attributes ===
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
zeros = np.zeros((2,3), dtype=float)
ones = np.ones((3,1), dtype=np.float64)
full = np.full((2,2), fill_value=-7)
lin = np.linspace(0, 1, 5)
arr = np.arange(0, 10, 2)
out = {
    "a.ndim": a.ndim, "a.shape": a.shape, "a.size": a.size, "a.dtype": a.dtype,
    "zeros.shape": zeros.shape, "ones.dtype": ones.dtype,
    "lin": lin, "arr": arr
}
out
Out[2]:
{'a.ndim': 2, 'a.shape': (2, 3), 'a.size': 6, 'a.dtype': dtype('int32'), 'zeros.shape': (2, 3), 'ones.dtype': dtype('float64'), 'lin': array([0. , 0.25, 0.5 , 0.75, 1. ]), 'arr': array([0, 2, 4, 6, 8])}
In [4]:
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
b = np.array([[1,2,3],[4,5,6]], dtype=np.int64)
c = np.array([[1,2,3],[4,5,6]], dtype=np.float64)
# Same 6 elements, different per-element size: int32 is 4 bytes, int64/float64 are 8
print(a.nbytes, b.nbytes, c.nbytes)
24 48 48
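The byte counts follow from size * itemsize. A quick check (a small sketch, not part of the original exercise):
In [ ]:
# nbytes is always the element count times the per-element size in bytes
for arr in (a, b, c):
    print(arr.dtype, arr.itemsize, arr.size * arr.itemsize == arr.nbytes)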
In [3]:
# === 2.2 Universal functions, aggregations, and sorting ===
x = np.array([[1.,1.],[2.,2.]])
y = np.array([[3.,4.],[6.,5.]])
prod = x * y # element-wise
exp = np.exp(x) # ufunc
sum0 = x.sum(axis=0)   # collapse axis 0 (rows): one sum per column
mean = x.mean() # global mean
std = x.std() # global std
mean1 = x.mean(axis=1) # collapse axis 1 (columns): one mean per row
std1 = x.std(axis=1)   # collapse axis 1 (columns): one std per row
print(prod)
print(exp)
print(sum0)
print(mean)
print(std)
print(mean1)
print(std1)
[[ 3.  4.]
 [12. 10.]]
[[2.71828183 2.71828183]
 [7.3890561  7.3890561 ]]
[3. 3.]
1.5
0.5
[1. 2.]
[0. 0.]
In [4]:
arg = y.argmax() # index of max in flattened array
arg_row = y.argmax(axis=1) # index of max in each row
arg_column = y.argmax(axis=0) # index of max in each column
print(arg)
print(arg_row)
print(arg_column)
2
[1 0]
[1 1]
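The flat index returned by argmax() can be mapped back to 2D coordinates with np.unravel_index. A quick follow-up sketch:
In [ ]:
# Convert the flattened argmax index back to (row, column) coordinates
r, c = np.unravel_index(y.argmax(), y.shape)
print(r, c)   # the maximum (6) sits at row 1, column 0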
In [5]:
# Sorting & ranking
m = np.array([[2,7,3],[7,2,1]])
sort_row = np.sort(m, axis=1) # sort each row independently
args_row = np.argsort(m, axis=1) # indices that sort each row
print(sort_row)
print(args_row)
[[2 3 7]
 [1 2 7]]
[[0 2 1]
 [2 1 0]]
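The argsort indices can be gathered back with np.take_along_axis (available since NumPy 1.15), which reproduces np.sort. A small consistency check:
In [ ]:
# Gathering m's elements with the argsort indices reproduces the row-sorted array
resorted = np.take_along_axis(m, args_row, axis=1)
print(np.array_equal(resorted, sort_row))   # expected: True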
In [6]:
# === 2.3 Broadcasting (column-wise normalization) ===
D = np.array([[0, 3, 6, 6],
              [1, 2, 5, 7],
              [2, 4, 4, 8]], dtype=np.float64)
print(D)
[[0. 3. 6. 6.]
 [1. 2. 5. 7.]
 [2. 4. 4. 8.]]
In [7]:
mn = D.min(axis=0) # shape (4,)
mx = D.max(axis=0) # shape (4,)
print(mn)
print(mx)
[0. 2. 4. 6.]
[2. 4. 6. 8.]
In [8]:
norm = (D - mn) / (mx - mn) # broadcasting over rows
print(norm)
[[0.  0.5 1.  0. ]
 [0.5 0.  0.5 0.5]
 [1.  1.  0.  1. ]]
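D has shape (3, 4) and mn shape (4,): NumPy aligns trailing dimensions, so mn is virtually replicated along axis 0. A sketch making the rule explicit (np.broadcast_shapes requires NumPy >= 1.20):
In [ ]:
# The (4,) vector is virtually replicated over the 3 rows of D
print(np.broadcast_shapes(D.shape, mn.shape))   # (3, 4)
# Equivalent explicit version that materializes the replication with np.tile
print(np.array_equal(norm, (D - np.tile(mn, (3, 1))) / np.tile(mx - mn, (3, 1))))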
In [9]:
# === 2.4 Indexing and views vs copies ===
A = np.arange(1, 13).reshape(3, 4)
print(A)
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
In [10]:
b = A[:2, :2] # top-left 2x2 subarray
c = A[:, -2:] # last two columns
print(b)
print(c)
[[1 2]
 [5 6]]
[[ 3  4]
 [ 7  8]
 [11 12]]
In [11]:
mask = A > 6 # boolean mask
A[mask] = -1 # set values > 6 to -1
print(A)
[[ 1  2  3  4]
 [ 5  6 -1 -1]
 [-1 -1 -1 -1]]
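Boolean-mask assignment overwrites A in place; np.where offers a non-destructive variant. An illustrative sketch:
In [ ]:
# np.where builds a new array, leaving the input untouched
B = np.arange(1, 13).reshape(3, 4)
print(np.where(B > 6, -1, B))
print(B)   # B is unchanged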
In [12]:
A = np.arange(1, 13).reshape(3, 4)
rows_view = A[1:3] # view
print(rows_view)
[[ 5  6  7  8]
 [ 9 10 11 12]]
In [13]:
rows_view[:] = 999
print(A)
[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]
In [14]:
rows_copy = A[1:3].copy()
rows_copy[:] = 555
print(A)
[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]
In [15]:
sub = A[[0, 2]][:, [1, 3]] # rows 0 & 2, cols 1 & 3
print(sub)
[[  2   4]
 [999 999]]
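np.shares_memory makes the view-vs-copy distinction testable; note that fancy indexing always returns a copy. A quick check, not part of the original exercise:
In [ ]:
# Basic slices are views; .copy() and fancy indexing allocate new memory
print(np.shares_memory(A, A[1:3]))          # True: slice -> view
print(np.shares_memory(A, A[1:3].copy()))   # False: explicit copy
print(np.shares_memory(A, A[[0, 2]]))       # False: fancy indexing -> copy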
3. Iris Dataset with NumPy¶
In [16]:
# === 3.1 Feature extraction and statistical analysis ===
X = np.genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3), dtype=float)
y = np.genfromtxt('iris.csv', delimiter=',', usecols=4, dtype=str)
print(X.shape)
print(y.shape)
print(np.unique(y))
(150, 4)
(150,)
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
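If integer labels are handier than strings, np.unique with return_inverse=True encodes them in one call. An optional sketch:
In [ ]:
# return_inverse maps every sample to the index of its class in the sorted unique array
class_names, y_codes = np.unique(y, return_inverse=True)
print(class_names)
print(y_codes[:5])   # the first five samples as integer codes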
In [17]:
# Computing global statistics for each feature
feature_names = np.array(["sepal_len","sepal_wid","petal_len","petal_wid"])
mu = X.mean(axis=0)
sd = X.std(axis=0)
for name, m, s in zip(feature_names, mu, sd):
    print(f"{name}: mean={m:.2f}, std={s:.2f}")
sepal_len: mean=5.84, std=0.83
sepal_wid: mean=3.05, std=0.43
petal_len: mean=3.76, std=1.76
petal_wid: mean=1.20, std=0.76
In [18]:
# Analyzing statistics by species:
classes = np.unique(y)
for c in classes:
    mask = (y == c)
    mean = X[mask].mean(axis=0)
    std = X[mask].std(axis=0)
    print(c, mean, std)
Iris-setosa [5.006 3.418 1.464 0.244] [0.34894699 0.37719491 0.17176728 0.10613199]
Iris-versicolor [5.936 2.77  4.26  1.326] [0.51098337 0.31064449 0.46518813 0.19576517]
Iris-virginica [6.588 2.974 5.552 2.026] [0.62948868 0.31925538 0.54634787 0.27188968]
In [19]:
# === 3.2 Feature standardization ===
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
Z = (X - X_mean) / X_std
Z_mean = Z.mean(axis=0)
Z_std = Z.std(axis=0)
print(Z_mean)
print(Z_std)
print(Z.shape)
[-1.69031455e-15 -1.63702385e-15 -1.48251781e-15 -1.62314606e-15]
[1. 1. 1. 1.]
(150, 4)
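The residual means are zero only up to floating-point round-off, so np.allclose is the appropriate verification. A quick sanity check:
In [ ]:
# Verify the standardization within numerical tolerance
print(np.allclose(Z.mean(axis=0), 0.0))   # expected: True
print(np.allclose(Z.std(axis=0), 1.0))    # expected: True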
In [20]:
# === 3.3 (*) Naïve single-feature classifier ===
# Choose feature with the largest *minimum* pairwise distance among class means
classes = np.unique(y)
means = np.vstack([X[y==c].mean(axis=0) for c in classes]) # vertical stack of class means
print(means)
[[5.006 3.418 1.464 0.244]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]
In [21]:
from itertools import combinations
# Get all unique pairs of class indices
class_pairs = np.array(list(combinations(range(len(classes)), 2)))
# Compute differences for all pairs and features at once
pair_diffs = np.abs(means[class_pairs[:, 0], :] - means[class_pairs[:, 1], :])
# Find minimum separation for each feature
min_sep_by_feat = np.min(pair_diffs, axis=0)
print(min_sep_by_feat)
[0.652 0.204 1.292 0.7 ]
In [22]:
# Select best feature
best_feat = int(np.argmax(min_sep_by_feat))
best_feat_name = feature_names[best_feat]
print(f"Best feature: {best_feat} ({best_feat_name})")
Best feature: 2 (petal_len)
In [23]:
# Simple thresholding rule on best feature: midpoints between class means (sorted)
m = means[:, best_feat]
order = np.argsort(m)
m_sorted = m[order]
classes_sorted = classes[order]
print(m_sorted)
# two midpoints separating 3 classes
t1 = 0.5*(m_sorted[0] + m_sorted[1])
t2 = 0.5*(m_sorted[1] + m_sorted[2])
print(t1, t2)
# extract the feature and apply the thresholds
feature_values = X[:, best_feat]
predictions = np.where(feature_values < t1, 0,
                       np.where(feature_values < t2, 1, 2))
# map back to original class names
predictions_classes = classes_sorted[predictions]
acc = (predictions_classes == y).mean()
print(acc)
[1.464 4.26  5.552]
2.862 4.906
0.9466666666666667
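To see which species absorb the remaining ~5% of errors, a small confusion-matrix sketch (rows = true class, columns = predicted class; an optional follow-up, not required by the lab) can be built with np.add.at:
In [ ]:
# classes comes from np.unique, hence is sorted, so searchsorted maps names to codes 0..2
true_codes = np.searchsorted(classes, y)
pred_codes = np.searchsorted(classes, predictions_classes)
conf = np.zeros((3, 3), dtype=int)
np.add.at(conf, (true_codes, pred_codes), 1)   # unbuffered scatter-add over index pairs
print(conf)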
4. MNIST Dataset with NumPy¶
In [24]:
# === 4.1 Loading and exploring the data ===
raw = np.loadtxt("mnist_test.csv", delimiter=",", dtype=np.int16)
y = raw[:, 0]
X = raw[:, 1:]
X.shape, y.shape
Out[24]:
((10000, 784), (10000,))
In [25]:
# === 4.2 Visual inspection of samples ===
def ascii_render(vec, rows=28, cols=28):
    img = vec.reshape(rows, cols)
    chars = np.empty_like(img, dtype=str)   # string array: one character per pixel
    chars[img < 64] = " "
    chars[(img >= 64) & (img < 128)] = "."
    chars[(img >= 128) & (img < 192)] = "*"
    chars[img >= 192] = "#"
    return "\n".join("".join(row) for row in chars)
print(ascii_render(X[45]))
[28x28 ASCII rendering of the digit at X[45]; row breaks lost in this export]
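The same four-level thresholding can be expressed with np.digitize, which bins every pixel in a single call. An equivalent alternative sketch:
In [ ]:
# np.digitize maps each pixel to a bin index 0..3, used to index a character palette
def ascii_render_digitize(vec, rows=28, cols=28):
    palette = np.array([" ", ".", "*", "#"])
    bins = np.digitize(vec.reshape(rows, cols), [64, 128, 192])
    return "\n".join("".join(row) for row in palette[bins])

print(ascii_render_digitize(X[45]) == ascii_render(X[45]))   # expected: True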
In [26]:
# === 4.3 Pixel frequency comparison by class ===
# Get the samples of digits '0' and '1'
mask0 = (y == 0)
mask1 = (y == 1)
X_0 = X[mask0]
X_1 = X[mask1]
print(X_0.shape, X_1.shape)
(980, 784) (1135, 784)
In [27]:
# For each pixel, count how many '0' and '1' images have pixel value >= 128
map_0 = (X_0 >= 128).sum(axis=0)
map_1 = (X_1 >= 128).sum(axis=0)
diff = np.abs(map_0 - map_1)
# Find the pixel with the largest difference
p = int(np.argmax(diff))
r = p // 28
c = p % 28
print("best_pixel_index", p, (r,c))
best_pixel_index 406 (14, 14)
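The two classes have different sizes (980 vs. 1135 samples), so comparing raw counts slightly favors the larger class; dividing by the class size compares frequencies instead. An optional variation:
In [ ]:
# Normalize counts by class size to compare per-pixel frequencies in [0, 1]
freq_0 = map_0 / mask0.sum()
freq_1 = map_1 / mask1.sum()
p_norm = int(np.argmax(np.abs(freq_0 - freq_1)))
print("best_pixel_index (normalized)", p_norm, (p_norm // 28, p_norm % 28))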
In [28]:
# === 4.4 (*) Pairwise distance analysis ===
idxs = [25, 29, 31, 34]
V = X[idxs].astype(float)
In [29]:
# The Gram matrix G[i,j] = V[i] · V[j] (dot product between images i and j)
# This is computed efficiently using matrix multiplication: V @ V.T
G = V @ V.T
print(G)
[[12313079.  1280304.   980909.  3891842.]
 [ 1280304.  2773630.  1848654.  1113660.]
 [  980909.  1848654.  2296862.   914244.]
 [ 3891842.  1113660.   914244.  5859668.]]
In [30]:
# For each image i, compute ||V[i]||² = sum of squared pixel values
# axis=1 sums across pixels (columns), keepdims=True maintains shape (4,1)
nrm = np.sum(V*V, axis=1, keepdims=True)
print(nrm)
[[12313079.]
 [ 2773630.]
 [ 2296862.]
 [ 5859668.]]
In [31]:
# Use the identity: ||a - b||² = ||a||² + ||b||² - 2(a·b)
# Broadcasting: nrm (4,1) + nrm.T (1,4) - 2*G (4,4) = (4,4)
D2 = nrm + nrm.T - 2*G
D = np.sqrt(D2)
print(D)
[[   0.         3539.22321986 3556.41996958 3223.2069434 ]
 [3539.22321986    0.         1171.82933911 2531.00335835]
 [3556.41996958 1171.82933911    0.         2515.55997742]
 [3223.2069434  2531.00335835 2515.55997742    0.        ]]
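As a sanity check, the same distances can be computed directly by broadcasting pairwise differences (more memory-hungry than the Gram-matrix identity, but transparent):
In [ ]:
# Direct computation: norm of V[i] - V[j] for every pair, via a (4, 4, 784) difference tensor
D_direct = np.linalg.norm(V[:, None, :] - V[None, :, :], axis=2)
print(np.allclose(D, D_direct))   # expected: True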