MNIST Benchmark Parity: Sorix vs. TensorFlow vs. PyTorch¶
This notebook compares the performance of Sorix with industry-leading frameworks like PyTorch and TensorFlow using the MNIST (Digit Recognizer) dataset. We will measure:
- Training Time (Total across multiple epochs).
- Accuracy on the test set.
- Inference Speed (Average time per batch).
Experimental Setup¶
Personal Workstation Specs: Intel Core i9 (32 cores), 64GB RAM — this explains the very fast CPU timings reported below.
Architecture: 3-layer MLP with BatchNorm and Dropout.
Optimizer: RMSprop (lr=1e-3, alpha/rho=0.99).
Loss: CrossEntropy.
Hardware: Comparisons explicitly performed on both CPU and GPU.
In [1]:
Copied!
# Uncomment the next line and run this cell to install sorix
#!pip install 'sorix @ git+https://github.com/Mitchell-Mirano/sorix.git@main'
# Uncomment the next line and run this cell to install sorix
#!pip install 'sorix @ git+https://github.com/Mitchell-Mirano/sorix.git@main'
In [6]:
Copied!
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Set seeds for reproducibility
def seed_everything(seed: int = 42) -> None:
    """Seed every available RNG (Python stdlib, NumPy, PyTorch, TensorFlow).

    Frameworks that are not installed are skipped; any other failure (e.g. a
    broken installation) propagates instead of being hidden. The original
    bare ``except: pass`` also swallowed ``KeyboardInterrupt``/``SystemExit``
    and real seeding errors — narrowed to ``ImportError``.
    """
    import random
    random.seed(seed)      # stdlib RNG (shuffling, sampling)
    np.random.seed(seed)   # legacy NumPy global RNG
    try:
        import torch
        torch.manual_seed(seed)  # seeds CPU (and, per torch docs, all CUDA devices)
    except ImportError:
        pass  # PyTorch not installed — nothing to seed
    try:
        import tensorflow as tf
        tf.random.set_seed(seed)  # TF global op-level seed
    except ImportError:
        pass  # TensorFlow not installed — nothing to seed
# Path is relative to this notebook's directory — adjust if running elsewhere.
DATA_PATH = "../data/digit-recognizer/train.csv"
print(f"Using data from: {os.path.abspath(DATA_PATH)}")
# Kaggle Digit Recognizer CSV: one 'label' column + 784 pixel columns.
data = pd.read_csv(DATA_PATH)
# Shared experiment constants — identical for all three frameworks.
SEED = 42
EPOCHS = 10
TRAIN_BATCH_SIZE = 128
# Logging CPU Info
import multiprocessing
print(f"CPU Cores available: {multiprocessing.cpu_count()}")
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Set seeds for reproducibility
def seed_everything(seed=42):
np.random.seed(seed)
try: import torch; torch.manual_seed(seed)
except: pass
try: import tensorflow as tf; tf.random.set_seed(seed)
except: pass
DATA_PATH = "../data/digit-recognizer/train.csv"
print(f"Using data from: {os.path.abspath(DATA_PATH)}")
data = pd.read_csv(DATA_PATH)
SEED = 42
EPOCHS = 10
TRAIN_BATCH_SIZE = 128
# Logging CPU Info
import multiprocessing
print(f"CPU Cores available: {multiprocessing.cpu_count()}")
Using data from: /home/mitchellmirano/Desktop/MitchellProjects/sorix/docs/examples/data/digit-recognizer/train.csv CPU Cores available: 32
0. Data Preparation¶
We use the same data split for all frameworks to ensure a fair comparison.
In [3]:
Copied!
from sklearn.model_selection import train_test_split
# Scale pixel values from [0, 255] to [0, 1] as float32.
X = data.drop("label", axis=1).values.astype('float32') / 255.0
y = data["label"].values
# Fixed random_state so every framework trains/tests on the exact same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
from sklearn.model_selection import train_test_split
X = data.drop("label", axis=1).values.astype('float32') / 255.0
y = data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
Train size: 33600, Test size: 8400
1. Unified Device-Aware Benchmark¶
We evaluate each framework on both CPU and GPU (if available) to differentiate performance accurately.
In [4]:
Copied!
import torch
import tensorflow as tf
import cupy as cp
import time
import sorix
import numpy as np
from sorix import tensor
from sorix.nn import Module, Linear, CrossEntropyLoss, ReLU, BatchNorm1d, Dropout
from sorix.optim import RMSprop
from sorix.utils.data import Dataset, DataLoader
# Results storage — each run_* function appends one dict per (framework, device) run.
all_results = []
INFERENCE_BATCH_CPU = 4096 # large batch amortizes per-batch Python overhead on CPU
INFERENCE_BATCH_GPU = 1024
def run_sorix(device_name='cpu'):
# Benchmark Sorix on the given device ('cpu' or 'cuda'):
# trains the 3-layer MLP for EPOCHS, times full-test-set inference,
# and appends a result row to the global all_results list.
# NOTE: statement order relative to time.time() calls defines what is measured.
print(f"--- Running Sorix on {device_name} ---")
seed_everything(SEED)
# Same architecture as the PyTorch/TensorFlow models below:
# 784 -> 128 (no bias, BatchNorm) -> 64 -> 10, ReLU activations, 0.2 dropout.
class SorixModel(Module):
def __init__(self):
super().__init__()
self.linear1 = Linear(784, 128, bias=False)
self.bn1 = BatchNorm1d(128)
self.linear2 = Linear(128, 64)
self.linear3 = Linear(64, 10)
self.relu = ReLU()
self.dropout = Dropout(p=0.2)
def forward(self, x):
x = self.linear1(x); x = self.bn1(x); x = self.relu(x)
x = self.linear2(x); x = self.relu(x)
x = self.dropout(x); x = self.linear3(x)
return x
model = SorixModel().to(device_name)
loss_fn = CrossEntropyLoss()
# lr/alpha mirror the torch RMSprop(alpha=0.99) and tf RMSprop(rho=0.99) below.
optimizer = RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
train_ds = Dataset(X_train, y_train.reshape(-1, 1))
train_loader = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time — the timer includes batch iteration and host->device transfers.
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in train_loader:
xb, yb = xb.to(device_name), yb.to(device_name)
optimizer.zero_grad()
logits = model(xb)
loss = loss_fn(logits, yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time — eval mode disables dropout and uses BatchNorm running stats.
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = DataLoader(Dataset(X_test, y_test.reshape(-1, 1)), batch_size=batch_size)
# Warmup — run one forward pass before timing so one-off setup cost is excluded.
dummy = tensor(X_test[:100]).to(device_name)
with sorix.no_grad(): _ = model(dummy)
start_inf = time.time()
with sorix.no_grad():
for xb, _ in inf_loader:
xb = xb.to(device_name)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy check — computed on the first 1000 test samples only (matches the
# PyTorch/TensorFlow runs, so the comparison stays apples-to-apples).
tx = tensor(X_test[:1000]).to(device_name)
with sorix.no_grad():
out = model(tx)
preds = sorix.argmax(out, axis=1, keepdims=True)
acc = (preds.cpu().data.flatten() == y_test[:1000]).mean()
all_results.append({
'Framework': 'Sorix',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_pytorch(device_name='cpu'):
# Benchmark PyTorch on the given device ('cpu' or 'cuda'); same protocol and
# architecture as run_sorix, appends one result row to all_results.
print(f"--- Running PyTorch on {device_name} ---")
pt_device = torch.device(device_name)
seed_everything(SEED)
# Mirror of SorixModel: 784 -> 128 (no bias, BatchNorm) -> 64 -> 10.
class PyTorchModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Linear(784, 128, bias=False),
torch.nn.BatchNorm1d(128),
torch.nn.ReLU(),
torch.nn.Linear(128, 64),
torch.nn.ReLU(),
torch.nn.Dropout(0.2),
torch.nn.Linear(64, 10)
)
def forward(self, x): return self.net(x)
model = PyTorchModel().to(pt_device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
ds = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).long())
loader = torch.utils.data.DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time — includes batch iteration and host->device transfers.
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in loader:
xb, yb = xb.to(pt_device), yb.to(pt_device)
optimizer.zero_grad()
loss = loss_fn(model(xb), yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = torch.utils.data.DataLoader(
torch.utils.data.TensorDataset(torch.from_numpy(X_test)),
batch_size=batch_size
)
# Warmup — one untimed forward pass excludes one-off setup cost.
dummy = torch.from_numpy(X_test[:100]).to(pt_device)
with torch.no_grad(): _ = model(dummy)
start_inf = time.time()
with torch.no_grad():
# single-tensor TensorDataset yields 1-tuples, hence the "xb," unpacking
for xb, in inf_loader:
xb = xb.to(pt_device)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy — on the first 1000 test samples only, matching the other runs.
model.eval()
tx = torch.from_numpy(X_test[:1000]).to(pt_device)
with torch.no_grad():
acc = (model(tx).argmax(1).cpu().numpy() == y_test[:1000]).mean()
all_results.append({
'Framework': 'PyTorch',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_tensorflow(device_name='cpu'):
# Benchmark TensorFlow/Keras on the given device ('cpu' or 'gpu'); same
# protocol and architecture as run_sorix/run_pytorch, appends to all_results.
tf_dev = f"/{device_name.upper()}:0"
print(f"--- Running TensorFlow on {tf_dev} ---")
seed_everything(SEED)
with tf.device(tf_dev):
# Mirror of the other models: 784 -> 128 (no bias, BatchNorm) -> 64 -> 10.
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(128, input_shape=(784,), use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(64),
tf.keras.layers.ReLU(),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10)
])
# rho=0.99 matches alpha=0.99 in the Sorix/PyTorch RMSprop configs;
# from_logits=True because the last Dense layer has no softmax.
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.99),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# 1. Training Time — model.fit handles batching/shuffling internally.
start_train = time.time()
model.fit(X_train, y_train, batch_size=TRAIN_BATCH_SIZE, epochs=EPOCHS, verbose=0)
train_time = time.time() - start_train
# 2. Inference Time
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
# Warmup — one untimed predict excludes graph tracing/compilation cost.
_ = model.predict(X_test[:100], verbose=0)
start_inf = time.time()
_ = model.predict(X_test, batch_size=batch_size, verbose=0)
inf_time = time.time() - start_inf
# Accuracy — on the first 1000 test samples only, matching the other runs.
preds = model.predict(X_test[:1000], verbose=0).argmax(axis=1)
acc = (preds == y_test[:1000]).mean()
all_results.append({
'Framework': 'TensorFlow',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
# Run all — CPU passes first, then GPU passes if CUDA is visible to Sorix.
sorix_model = run_sorix('cpu')
pytorch_model = run_pytorch('cpu')
tensorflow_model = run_tensorflow('cpu')
# NOTE(review): the GPU passes are gated only on sorix.cuda.is_available();
# this assumes PyTorch and TensorFlow can also see the GPU — confirm, otherwise
# run_pytorch('cuda') / run_tensorflow('gpu') would fail on a sorix-only setup.
if sorix.cuda.is_available():
sorix_model_gpu = run_sorix('cuda')
pytorch_model_gpu = run_pytorch('cuda')
tensorflow_model_gpu = run_tensorflow('gpu')
df_results = pd.DataFrame(all_results)
display(df_results)
import torch
import tensorflow as tf
import cupy as cp
import time
import sorix
import numpy as np
from sorix import tensor
from sorix.nn import Module, Linear, CrossEntropyLoss, ReLU, BatchNorm1d, Dropout
from sorix.optim import RMSprop
from sorix.utils.data import Dataset, DataLoader
# Results storage
all_results = []
INFERENCE_BATCH_CPU = 4096 # Using large batch for CPU as requested
INFERENCE_BATCH_GPU = 1024
def run_sorix(device_name='cpu'):
print(f"--- Running Sorix on {device_name} ---")
seed_everything(SEED)
class SorixModel(Module):
def __init__(self):
super().__init__()
self.linear1 = Linear(784, 128, bias=False)
self.bn1 = BatchNorm1d(128)
self.linear2 = Linear(128, 64)
self.linear3 = Linear(64, 10)
self.relu = ReLU()
self.dropout = Dropout(p=0.2)
def forward(self, x):
x = self.linear1(x); x = self.bn1(x); x = self.relu(x)
x = self.linear2(x); x = self.relu(x)
x = self.dropout(x); x = self.linear3(x)
return x
model = SorixModel().to(device_name)
loss_fn = CrossEntropyLoss()
optimizer = RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
train_ds = Dataset(X_train, y_train.reshape(-1, 1))
train_loader = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in train_loader:
xb, yb = xb.to(device_name), yb.to(device_name)
optimizer.zero_grad()
logits = model(xb)
loss = loss_fn(logits, yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = DataLoader(Dataset(X_test, y_test.reshape(-1, 1)), batch_size=batch_size)
# Warmup
dummy = tensor(X_test[:100]).to(device_name)
with sorix.no_grad(): _ = model(dummy)
start_inf = time.time()
with sorix.no_grad():
for xb, _ in inf_loader:
xb = xb.to(device_name)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy check
tx = tensor(X_test[:1000]).to(device_name)
with sorix.no_grad():
out = model(tx)
preds = sorix.argmax(out, axis=1, keepdims=True)
acc = (preds.cpu().data.flatten() == y_test[:1000]).mean()
all_results.append({
'Framework': 'Sorix',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_pytorch(device_name='cpu'):
print(f"--- Running PyTorch on {device_name} ---")
pt_device = torch.device(device_name)
seed_everything(SEED)
class PyTorchModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Linear(784, 128, bias=False),
torch.nn.BatchNorm1d(128),
torch.nn.ReLU(),
torch.nn.Linear(128, 64),
torch.nn.ReLU(),
torch.nn.Dropout(0.2),
torch.nn.Linear(64, 10)
)
def forward(self, x): return self.net(x)
model = PyTorchModel().to(pt_device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
ds = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).long())
loader = torch.utils.data.DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in loader:
xb, yb = xb.to(pt_device), yb.to(pt_device)
optimizer.zero_grad()
loss = loss_fn(model(xb), yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = torch.utils.data.DataLoader(
torch.utils.data.TensorDataset(torch.from_numpy(X_test)),
batch_size=batch_size
)
# Warmup
dummy = torch.from_numpy(X_test[:100]).to(pt_device)
with torch.no_grad(): _ = model(dummy)
start_inf = time.time()
with torch.no_grad():
for xb, in inf_loader:
xb = xb.to(pt_device)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy
model.eval()
tx = torch.from_numpy(X_test[:1000]).to(pt_device)
with torch.no_grad():
acc = (model(tx).argmax(1).cpu().numpy() == y_test[:1000]).mean()
all_results.append({
'Framework': 'PyTorch',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_tensorflow(device_name='cpu'):
tf_dev = f"/{device_name.upper()}:0"
print(f"--- Running TensorFlow on {tf_dev} ---")
seed_everything(SEED)
with tf.device(tf_dev):
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(128, input_shape=(784,), use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(64),
tf.keras.layers.ReLU(),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10)
])
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.99),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# 1. Training Time
start_train = time.time()
model.fit(X_train, y_train, batch_size=TRAIN_BATCH_SIZE, epochs=EPOCHS, verbose=0)
train_time = time.time() - start_train
# 2. Inference Time
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
# Warmup
_ = model.predict(X_test[:100], verbose=0)
start_inf = time.time()
_ = model.predict(X_test, batch_size=batch_size, verbose=0)
inf_time = time.time() - start_inf
preds = model.predict(X_test[:1000], verbose=0).argmax(axis=1)
acc = (preds == y_test[:1000]).mean()
all_results.append({
'Framework': 'TensorFlow',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
# Run all
sorix_model = run_sorix('cpu')
pytorch_model = run_pytorch('cpu')
tensorflow_model = run_tensorflow('cpu')
if sorix.cuda.is_available():
sorix_model_gpu = run_sorix('cuda')
pytorch_model_gpu = run_pytorch('cuda')
tensorflow_model_gpu = run_tensorflow('gpu')
df_results = pd.DataFrame(all_results)
display(df_results)
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1774892960.012869 110092 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. I0000 00:00:1774892960.042410 110092 cpu_feature_guard.cc:227] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1774892960.736748 110092 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
--- Running Sorix on cpu --- --- Running PyTorch on cpu --- --- Running TensorFlow on /CPU:0 ---
I0000 00:00:1774892978.452214 110092 gpu_device.cc:2043] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6074 MB memory: -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9 /home/mitchellmirano/Desktop/MitchellProjects/sorix/.venv/lib/python3.13/site-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs) I0000 00:00:1774892978.891980 110258 service.cc:153] XLA service 0x7f2af8016b30 initialized for platform Host (this does not guarantee that XLA will be used). Devices: I0000 00:00:1774892978.892028 110258 service.cc:161] StreamExecutor [0]: Host, Default Version (Driver: 0.0.0; Runtime: 0.0.0; Toolkit: 0.0.0; DNN: 0.0.0) I0000 00:00:1774892978.917816 110258 dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable. I0000 00:00:1774892979.198704 110258 device_compiler.h:208] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
✅ GPU basic operation passed ✅ GPU available: NVIDIA GeForce RTX 4070 Laptop GPU CUDA runtime version: 13000 CuPy version: 13.6.0 --- Running Sorix on cuda --- --- Running PyTorch on cuda --- --- Running TensorFlow on /GPU:0 ---
I0000 00:00:1774893007.567461 110261 service.cc:153] XLA service 0x7f2ae00301f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: I0000 00:00:1774893007.567481 110261 service.cc:161] StreamExecutor [0]: NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9 (Driver: 13.1.0; Runtime: 12.8.0; Toolkit: 12.5.0; DNN: 9.10.2) I0000 00:00:1774893007.657842 110261 cuda_dnn.cc:461] Loaded cuDNN version 91002 I0000 00:00:1774893007.681040 110261 dot_merger.cc:481] Merging Dots in computation: a_inference_one_step_on_data_10677__.14 I0000 00:00:1774893007.708346 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893008.032738 111416 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_1_4', 32 bytes spill stores, 32 bytes spill loads I0000 00:00:1774893008.042438 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893008.589964 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. 
I0000 00:00:1774893008.866308 111414 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_20', 12 bytes spill stores, 12 bytes spill loads I0000 00:00:1774893010.795445 110264 dot_merger.cc:481] Merging Dots in computation: a_inference_one_step_on_data_10677__.14 I0000 00:00:1774893010.820588 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.210513 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.711918 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.955275 111518 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_20', 12 bytes spill stores, 12 bytes spill loads I0000 00:00:1774893016.229980 110254 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893016.557614 112306 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_8', 16 bytes spill stores, 16 bytes spill loads I0000 00:00:1774893018.147145 110257 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. 
Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893018.959095 110258 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
| Framework | Device | train_batch_size | Train Time | test_batch_size | Inference Time | Accuracy | |
|---|---|---|---|---|---|---|---|
| 0 | Sorix | CPU | 128 | 7.362622 | 4096 | 0.031955 | 0.972 |
| 1 | PyTorch | CPU | 128 | 9.213377 | 4096 | 0.048254 | 0.974 |
| 2 | TensorFlow | CPU | 128 | 17.142942 | 4096 | 0.186286 | 0.968 |
| 3 | Sorix | GPU | 128 | 6.205791 | 1024 | 0.015433 | 0.976 |
| 4 | PyTorch | GPU | 128 | 4.037397 | 1024 | 0.025247 | 0.976 |
| 5 | TensorFlow | GPU | 128 | 8.978923 | 1024 | 1.702561 | 0.975 |
2. Model Export and Size Comparison¶
Size is measured using CPU-exported state dicts.
In [5]:
Copied!
import pickle
# We use the CPU models for size comparison to be fair
# Sorix: pickle the state dict (parameters only, no architecture).
sorix_path = "model_sorix.sor"
with open(sorix_path, 'wb') as f: pickle.dump(sorix_model.state_dict(), f)
sorix_size = os.path.getsize(sorix_path) / 1024
# PyTorch: likewise, state dict only.
pytorch_path = "model_pytorch.pt"
torch.save(pytorch_model.state_dict(), pytorch_path)
pytorch_size = os.path.getsize(pytorch_path) / 1024
# TensorFlow: model.save writes architecture + optimizer state as well,
# so this file is expected to be larger than the two state-dict-only exports.
tensorflow_path = "model_tf.keras"
tensorflow_model.save(tensorflow_path) # TF saves full model usually
tensorflow_size = os.path.getsize(tensorflow_path) / 1024
# Sizes reported in KB.
model_sizes = pd.DataFrame({
'Framework': ['Sorix', 'PyTorch', 'TensorFlow'],
'Size (KB)': [sorix_size, pytorch_size, tensorflow_size]
})
display(model_sizes)
import pickle
# We use the CPU models for size comparison to be fair
sorix_path = "model_sorix.sor"
with open(sorix_path, 'wb') as f: pickle.dump(sorix_model.state_dict(), f)
sorix_size = os.path.getsize(sorix_path) / 1024
pytorch_path = "model_pytorch.pt"
torch.save(pytorch_model.state_dict(), pytorch_path)
pytorch_size = os.path.getsize(pytorch_path) / 1024
tensorflow_path = "model_tf.keras"
tensorflow_model.save(tensorflow_path) # TF saves full model usually
tensorflow_size = os.path.getsize(tensorflow_path) / 1024
model_sizes = pd.DataFrame({
'Framework': ['Sorix', 'PyTorch', 'TensorFlow'],
'Size (KB)': [sorix_size, pytorch_size, tensorflow_size]
})
display(model_sizes)
| Framework | Size (KB) | |
|---|---|---|
| 0 | Sorix | 429.559570 |
| 1 | PyTorch | 432.911133 |
| 2 | TensorFlow | 890.750000 |