Data Science and Machine Learning Lab - Lab #1: NumPy Introduction

1. Datasets Download

In [ ]:
# IRIS
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" -O iris.csv

# MNIST (CSV subset)
!wget "https://raw.githubusercontent.com/dbdmg/data-science-lab/master/datasets/mnist_test.csv" -O mnist_test.csv

2. Warm-up with NumPy

In [2]:
import numpy as np

# === 2.1 Array creation and attributes ===
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
zeros = np.zeros((2,3), dtype=float)
ones  = np.ones((3,1), dtype=np.float64)
full  = np.full((2,2), fill_value=-7)
lin   = np.linspace(0, 1, 5)
arr   = np.arange(0, 10, 2)

out = {
    "a.ndim": a.ndim, "a.shape": a.shape, "a.size": a.size, "a.dtype": a.dtype,
    "zeros.shape": zeros.shape, "ones.dtype": ones.dtype,
    "lin": lin, "arr": arr
}
out
Out[2]:
{'a.ndim': 2,
 'a.shape': (2, 3),
 'a.size': 6,
 'a.dtype': dtype('int32'),
 'zeros.shape': (2, 3),
 'ones.dtype': dtype('float64'),
 'lin': array([0.  , 0.25, 0.5 , 0.75, 1.  ]),
 'arr': array([0, 2, 4, 6, 8])}
In [4]:
a = np.array([[1,2,3],[4,5,6]], dtype=np.int32)
b = np.array([[1,2,3],[4,5,6]], dtype=np.int64)
c = np.array([[1,2,3],[4,5,6]], dtype=np.float64)

print(a.nbytes, b.nbytes, c.nbytes)
24 48 48
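nbytes is just size * itemsize: all three arrays hold 6 elements, stored on 4 bytes each for int32 and 8 bytes each for int64/float64. A quick check (the loop variable arr_ is new, chosen to avoid clobbering arr from above):

In [ ]:
# nbytes is always size * itemsize
for arr_ in (a, b, c):
    print(arr_.size, arr_.itemsize, arr_.size * arr_.itemsize == arr_.nbytes)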
In [3]:
# === 2.2 Universal functions, aggregations, and sorting ===
x = np.array([[1.,1.],[2.,2.]])
y = np.array([[3.,4.],[6.,5.]])

prod = x * y               # element-wise
exp  = np.exp(x)           # ufunc
sum0 = x.sum(axis=0)        # reduce along axis 0: one sum per column
mean = x.mean()             # global mean
std  = x.std()              # global std
mean1 = x.mean(axis=1)      # reduce along axis 1: one mean per row
std1  = x.std(axis=1)       # reduce along axis 1: one std per row

print(prod)
print(exp)
print(sum0)
print(mean)
print(std)
print(mean1)
print(std1)
[[ 3.  4.]
 [12. 10.]]
[[2.71828183 2.71828183]
 [7.3890561  7.3890561 ]]
[3. 3.]
1.5
0.5
[1. 2.]
[0. 0.]
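Passing keepdims=True keeps the reduced axis with length 1, which is convenient when the result must be broadcast back against the original array (as in Section 2.3 below):

In [ ]:
# keepdims=True preserves the reduced axis as a length-1 dimension
print(x.mean(axis=1).shape)                 # (2,)
print(x.mean(axis=1, keepdims=True).shape)  # (2, 1)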
In [4]:
arg = y.argmax()           # index of max in flattened array
arg_row = y.argmax(axis=1)    # index of max in each row
arg_column = y.argmax(axis=0)   # index of max in each column

print(arg)
print(arg_row)
print(arg_column)
2
[1 0]
[1 1]
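argmax() without an axis indexes the flattened array; np.unravel_index converts that flat index back to 2-D coordinates:

In [ ]:
# Flat index 2 in a (2, 2) array corresponds to row 1, column 0
print(np.unravel_index(y.argmax(), y.shape))  # (1, 0)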
In [5]:
# Sorting & ranking
m = np.array([[2,7,3],[7,2,1]])
sort_row = np.sort(m, axis=1)   # sort each row independently
args_row = np.argsort(m, axis=1) # indices that sort each row
print(sort_row)
print(args_row)
[[2 3 7]
 [1 2 7]]
[[0 2 1]
 [2 1 0]]
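The argsort indices can be applied back with np.take_along_axis, which reproduces the output of np.sort:

In [ ]:
# Gathering each row of m with its argsort indices recovers the sorted rows
print(np.array_equal(np.take_along_axis(m, args_row, axis=1), sort_row))  # True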
In [6]:
# === 2.3 Broadcasting (column-wise normalization) ===
D = np.array([[0,3,6,6],
              [1,2,5,7],
              [2,4,4,8]], dtype=np.float64)

print(D)
[[0. 3. 6. 6.]
 [1. 2. 5. 7.]
 [2. 4. 4. 8.]]
In [7]:
mn = D.min(axis=0)        # shape (4,)
mx = D.max(axis=0)        # shape (4,)

print(mn)
print(mx)
[0. 2. 4. 6.]
[2. 4. 6. 8.]
In [8]:
norm = (D - mn) / (mx - mn)  # broadcasting over rows
print(norm)
[[0.  0.5 1.  0. ]
 [0.5 0.  0.5 0.5]
 [1.  1.  0.  1. ]]
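Min-max scaling divides by mx - mn, which is zero for any constant column. A hedged variant (norm_safe is a new name) that guards against this case:

In [ ]:
# Avoid 0/0 on constant columns by substituting a dummy denominator of 1
rng = mx - mn
norm_safe = np.where(rng > 0, (D - mn) / np.where(rng > 0, rng, 1), 0.0)
print(np.allclose(norm_safe, norm))  # True here: no column of D is constant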
In [ ]:
# === 2.4 Indexing and views vs copies ===
A = np.arange(1, 13).reshape(3, 4)
print(A)
[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
In [10]:
b = A[:2, :2] # top-left 2x2 subarray
c = A[:, -2:] # last two columns

print(b)
print(c)
[[1 2]
 [5 6]]
[[ 3  4]
 [ 7  8]
 [11 12]]
In [11]:
mask = A > 6 # boolean mask
A[mask] = -1 # set values > 6 to -1
print(A)
[[ 1  2  3  4]
 [ 5  6 -1 -1]
 [-1 -1 -1 -1]]
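np.nonzero (or np.where with a single argument) recovers the coordinates a mask selects, one array per axis:

In [ ]:
# Row and column coordinates of the entries just overwritten with -1
rows, cols = np.nonzero(A == -1)
print(rows)  # [1 1 2 2 2 2]
print(cols)  # [2 3 0 1 2 3]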
In [12]:
A = np.arange(1, 13).reshape(3, 4)

rows_view = A[1:3]     # view
print(rows_view)
[[ 5  6  7  8]
 [ 9 10 11 12]]
In [13]:
rows_view[:] = 999
print(A)
[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]
In [14]:
rows_copy = A[1:3].copy()   # copy: owns an independent buffer
rows_copy[:] = 555          # leaves A untouched
print(A)
[[  1   2   3   4]
 [999 999 999 999]
 [999 999 999 999]]
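np.shares_memory makes the view/copy distinction explicit:

In [ ]:
# A view aliases A's buffer; a copy owns its own
print(np.shares_memory(A, rows_view))  # True
print(np.shares_memory(A, rows_copy))  # False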
In [15]:
sub = A[[0, 2]][:, [1, 3]]  # rows 0 & 2, cols 1 & 3
print(sub)
[[  2   4]
 [999 999]]
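Chaining A[[0, 2]] and [:, [1, 3]] creates an intermediate copy; np.ix_ builds an open mesh that selects the same sub-grid in a single fancy-indexing step:

In [ ]:
# Same selection in one step via an open mesh
print(np.array_equal(A[np.ix_([0, 2], [1, 3])], sub))  # True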

3. Iris Dataset with NumPy

In [16]:
# === 3.1 Feature extraction and statistical analysis ===
X = np.genfromtxt('iris.csv', delimiter=',', usecols=(0, 1, 2, 3), dtype=float)
y = np.genfromtxt('iris.csv', delimiter=',', usecols=4, dtype=str)

print(X.shape)
print(y.shape)
print(np.unique(y))
(150, 4)
(150,)
['Iris-setosa' 'Iris-versicolor' 'Iris-virginica']
In [17]:
# Computing global statistics for each feature
feature_names = np.array(["sepal_len","sepal_wid","petal_len","petal_wid"])
mu = X.mean(axis=0)
sd = X.std(axis=0)
for name, m, s in zip(feature_names, mu, sd):
    print(f"{name}: mean={m:.2f}, std={s:.2f}")
sepal_len: mean=5.84, std=0.83
sepal_wid: mean=3.05, std=0.43
petal_len: mean=3.76, std=1.76
petal_wid: mean=1.20, std=0.76
In [18]:
# Analyzing statistics by species:
classes = np.unique(y)
for c in classes:
    mask = (y == c)
    mean = X[mask].mean(axis=0)
    std = X[mask].std(axis=0)
    print(c, mean, std)
Iris-setosa [5.006 3.418 1.464 0.244] [0.34894699 0.37719491 0.17176728 0.10613199]
Iris-versicolor [5.936 2.77  4.26  1.326] [0.51098337 0.31064449 0.46518813 0.19576517]
Iris-virginica [6.588 2.974 5.552 2.026] [0.62948868 0.31925538 0.54634787 0.27188968]
In [19]:
# === 3.2 Feature standardization ===
X_mean = X.mean(axis=0)
X_std  = X.std(axis=0)
Z = (X - X_mean) / X_std
Z_mean = Z.mean(axis=0)
Z_std  = Z.std(axis=0)

print(Z_mean)
print(Z_std)
print(Z.shape)
[-1.69031455e-15 -1.63702385e-15 -1.48251781e-15 -1.62314606e-15]
[1. 1. 1. 1.]
(150, 4)
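The standardized means are not exactly zero because of floating-point round-off; residuals on the order of 1e-15 are noise, as np.allclose confirms:

In [ ]:
# Means and stds match 0 and 1 up to round-off
print(np.allclose(Z_mean, 0.0))  # True
print(np.allclose(Z_std, 1.0))   # True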
In [ ]:
# === 3.3 (*) Naïve single-feature classifier ===
# Choose feature with the largest *minimum* pairwise distance among class means
classes = np.unique(y)
means = np.vstack([X[y==c].mean(axis=0) for c in classes])  # vertical stack of class means 
print(means)
[[5.006 3.418 1.464 0.244]
 [5.936 2.77  4.26  1.326]
 [6.588 2.974 5.552 2.026]]
In [21]:
from itertools import combinations
# Get all unique pairs of class indices
class_pairs = np.array(list(combinations(range(len(classes)), 2)))
# Compute differences for all pairs and features at once
pair_diffs = np.abs(means[class_pairs[:, 0], :] - means[class_pairs[:, 1], :])
# Find minimum separation for each feature
min_sep_by_feat = np.min(pair_diffs, axis=0)
print(min_sep_by_feat)
[0.652 0.204 1.292 0.7  ]
In [22]:
# Select best feature
best_feat = int(np.argmax(min_sep_by_feat))
best_feat_name = feature_names[best_feat]
print(f"Best feature: {best_feat} ({best_feat_name})")
Best feature: 2 (petal_len)
In [23]:
# Simple thresholding rule on best feature: midpoints between class means (sorted)
m = means[:, best_feat]
order = np.argsort(m)
m_sorted = m[order]
classes_sorted = classes[order]
print(m_sorted)

# two midpoints separating 3 classes
t1 = 0.5*(m_sorted[0] + m_sorted[1])
t2 = 0.5*(m_sorted[1] + m_sorted[2])
print(t1, t2)

# extract the feature and apply the thresholds
feature_values = X[:, best_feat]
predictions = np.where(feature_values < t1, 0,
               np.where(feature_values < t2, 1, 2))

# map back to original class names
predictions_classes = classes_sorted[predictions]

acc = (predictions_classes == y).mean()
print(acc)
[1.464 4.26  5.552]
2.862 4.906
0.9466666666666667
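The global accuracy hides how the errors are distributed. A quick sketch of the per-class accuracy (setosa is linearly separable on petal length, so its errors should be zero, while versicolor and virginica overlap):

In [ ]:
# Per-class accuracy of the single-feature classifier
for c in classes:
    m_c = (y == c)
    print(c, (predictions_classes[m_c] == c).mean())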

4. MNIST Dataset with NumPy

In [24]:
# === 4.1 Loading and exploring the data ===
raw = np.loadtxt("mnist_test.csv", delimiter=",", dtype=np.int16)
y  = raw[:, 0]
X  = raw[:, 1:]

X.shape, y.shape
Out[24]:
((10000, 784), (10000,))
In [25]:
# === 4.2 Visual inspection of samples ===
def ascii_render(vec, rows=28, cols=28):
    img = vec.reshape(rows, cols)
    
    chars = np.empty_like(img, dtype=str)   # dtype=str gives one character per cell ('<U1')
    
    chars[img < 64] = " "
    chars[(img >= 64) & (img < 128)] = "."
    chars[(img >= 128) & (img < 192)] = "*"
    chars[img >= 192] = "#"
    
    return "\n".join("".join(row) for row in chars)

print(ascii_render(X[45]))
                            
                            
                            
                            
                            
            *#*.            
            #########.      
            #.  . .. *      
            #.              
            #.              
           .#.              
           *#.              
           ######.          
          .#######.         
          ##*    ##         
         *#*     *#.        
          *       #.        
                 .#.        
        .*       ##.        
        #*       ##         
        #.      ##*         
        #.     *##          
        *#. .*##*           
         #####*.            
          *#.               
                            
                            
                            
In [26]:
# === 4.3 Pixel frequency comparison by class === 
# Get the samples of digits '0' and '1'
mask0 = (y == 0)
mask1 = (y == 1)

X_0 = X[mask0]
X_1 = X[mask1]

print(X_0.shape, X_1.shape)
(980, 784) (1135, 784)
In [27]:
# For each pixel, count how many '0' and '1' images have pixel value >= 128
map_0 = (X_0 >= 128).sum(axis=0)
map_1 = (X_1 >= 128).sum(axis=0)
diff = np.abs(map_0 - map_1)

# Find the pixel with the largest difference
p = int(np.argmax(diff))
r = p // 28
c = p % 28
print("best_pixel_index", p, (r,c))
best_pixel_index 406 (14, 14)
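Note that the two classes have different sizes (980 vs. 1135 samples), so raw counts are slightly biased toward the larger class. A hedged variant that compares per-class frequencies instead (the names freq_0, freq_1, p_freq are new; it may pick a different pixel):

In [ ]:
# Normalize by class size: compare frequencies, not raw counts
freq_0 = (X_0 >= 128).mean(axis=0)
freq_1 = (X_1 >= 128).mean(axis=0)
p_freq = int(np.argmax(np.abs(freq_0 - freq_1)))
print(p_freq, divmod(p_freq, 28))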
In [28]:
# === 4.4 (*) Pairwise distance analysis ===
idxs = [25, 29, 31, 34]
V = X[idxs].astype(float)
In [29]:
# The Gram matrix G[i,j] = V[i] · V[j] (dot product between images i and j)
# This is computed efficiently using matrix multiplication: V @ V.T
G = V @ V.T
print(G)
[[12313079.  1280304.   980909.  3891842.]
 [ 1280304.  2773630.  1848654.  1113660.]
 [  980909.  1848654.  2296862.   914244.]
 [ 3891842.  1113660.   914244.  5859668.]]
In [30]:
# For each image i, compute ||V[i]||² = sum of squared pixel values
# axis=1 sums across pixels (columns), keepdims=True maintains shape (4,1)
nrm = np.sum(V*V, axis=1, keepdims=True)
print(nrm)
[[12313079.]
 [ 2773630.]
 [ 2296862.]
 [ 5859668.]]
In [31]:
# Use the identity: ||a - b||² = ||a||² + ||b||² - 2(a·b)
# Broadcasting: nrm (4,1) + nrm.T (1,4) - 2*G (4,4) = (4,4)
D2 = nrm + nrm.T - 2*G
# Round-off can leave tiny negative entries in D2; np.maximum(D2, 0) would guard the sqrt
D = np.sqrt(D2)
print(D)
[[   0.         3539.22321986 3556.41996958 3223.2069434 ]
 [3539.22321986    0.         1171.82933911 2531.00335835]
 [3556.41996958 1171.82933911    0.         2515.55997742]
 [3223.2069434  2531.00335835 2515.55997742    0.        ]]
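As a cross-check, the same matrix can be computed directly by broadcasting the pairwise differences, at the cost of materializing a (4, 4, 784) tensor (D_direct is a new name):

In [ ]:
# Direct pairwise Euclidean distances, to validate the Gram-matrix shortcut
D_direct = np.linalg.norm(V[:, None, :] - V[None, :, :], axis=2)
print(np.allclose(D, D_direct))  # True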