import mlx.core as mx
import numpy as np
import struct

# Data loading functions. To download the data run the data.sh script.
def load_images(filename):
    with open(filename, 'rb') as f:
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
        data = np.frombuffer(f.read(), dtype=np.uint8)
        images = data.reshape(num_images, rows, cols)
        # Scale pixel values to [0, 1] and flatten each image into a
        # single row of rows*cols features.
        array = np.array(images / 255.0)
        return np.reshape(array, (array.shape[0], array.shape[1] * array.shape[2]))

def load_labels(filename):
    with open(filename, 'rb') as f:
        magic, num_labels = struct.unpack('>II', f.read(8))
        labels = np.frombuffer(f.read(), dtype=np.uint8)
        return labels

# Softmax distribution function. Given a vector, it returns another
# vector whose elements sum to 1.0. The softmax here is stabilized, to
# prevent overflow, by subtracting the maximum value of the vector from
# each of its elements. This makes all values non-positive without
# changing the end result, since every element is shifted by the same
# amount.
def softmax(x):
    stable_x = x - mx.max(x, axis=1, keepdims=True)
    e = mx.exp(stable_x)
    return e / mx.sum(e, axis=1, keepdims=True)

# ReLU activation function. If x > 0 it returns x, otherwise it returns
# 0.0. This is what makes the neural network non-linear.
def relu(x):
    return mx.maximum(0.0, x)

# Linear prediction function. Given input, weights and bias it computes
# the prediction. The result should be activated with ReLU before being
# passed to the next layer, or turned into a distribution with softmax
# when used in the output layer.
def linear(x, W, b):
    return x @ W + b
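
# Illustrative forward-pass sketch, not part of the original listing: it
# assumes the MNIST IDX files fetched by data.sh are named
# 'train-images-idx3-ubyte' / 'train-labels-idx1-ubyte' and picks an
# arbitrary 32-unit hidden layer. It only shows how linear(), relu() and
# softmax() compose into a two-layer network prediction.
if __name__ == '__main__':
    images = mx.array(load_images('train-images-idx3-ubyte'), dtype=mx.float32)
    labels = load_labels('train-labels-idx1-ubyte')

    # Randomly initialized parameters: 784 inputs -> 32 hidden -> 10 classes.
    W1 = mx.random.normal((784, 32)) * 0.01
    b1 = mx.zeros((32,))
    W2 = mx.random.normal((32, 10)) * 0.01
    b2 = mx.zeros((10,))

    batch = images[:8]                       # small batch of 8 images
    hidden = relu(linear(batch, W1, b1))     # (8, 32)
    probs = softmax(linear(hidden, W2, b2))  # (8, 10), each row sums to 1.0
    print(probs.shape, mx.sum(probs, axis=1))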