MNIST Benchmark Parity: Sorix vs. TensorFlow vs. PyTorch¶
This notebook compares the performance of Sorix with industry-leading frameworks like PyTorch and TensorFlow using the MNIST (Digit Recognizer) dataset. We will measure:
- Training Time (Total across multiple epochs).
- Accuracy on the test set.
- Inference Speed (Average time per batch).
Experimental Setup¶
Personal Workstation Specs: Intel Core i9 (32 cores), 64GB RAM — this explains the very fast CPU timings reported below.
Architecture: 3-layer MLP with BatchNorm and Dropout.
Optimizer: RMSprop (lr=1e-3, alpha/rho=0.99).
Loss: CrossEntropy.
Hardware: Comparisons explicitly performed on both CPU and GPU.
In [1]:
Copied!
# Uncomment the next line and run this cell to install sorix
#!pip install 'sorix @ git+https://github.com/Mitchell-Mirano/sorix.git@main'
# Uncomment the next line and run this cell to install sorix
#!pip install 'sorix @ git+https://github.com/Mitchell-Mirano/sorix.git@main'
In [6]:
Copied!
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Set seeds for reproducibility
def seed_everything(seed: int = 42) -> None:
    """Seed every available RNG (Python stdlib, NumPy, PyTorch, TensorFlow).

    Frameworks that are not installed are skipped; any other failure (e.g. a
    broken installation) propagates instead of being hidden. The original
    bare ``except: pass`` also swallowed ``KeyboardInterrupt``/``SystemExit``
    and real seeding errors — narrowed to ``ImportError``.
    """
    import random
    random.seed(seed)      # stdlib RNG (shuffling, sampling)
    np.random.seed(seed)   # legacy NumPy global RNG
    try:
        import torch
        torch.manual_seed(seed)  # seeds CPU (and, per torch docs, all CUDA devices)
    except ImportError:
        pass  # PyTorch not installed — nothing to seed
    try:
        import tensorflow as tf
        tf.random.set_seed(seed)  # TF global op-level seed
    except ImportError:
        pass  # TensorFlow not installed — nothing to seed
# Path is relative to this notebook's directory — adjust if running elsewhere.
DATA_PATH = "../data/digit-recognizer/train.csv"
print(f"Using data from: {os.path.abspath(DATA_PATH)}")
# Kaggle Digit Recognizer CSV: one 'label' column + 784 pixel columns.
data = pd.read_csv(DATA_PATH)
# Shared experiment constants — identical for all three frameworks.
SEED = 42
EPOCHS = 10
TRAIN_BATCH_SIZE = 128
# Logging CPU Info
import multiprocessing
print(f"CPU Cores available: {multiprocessing.cpu_count()}")
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Set seeds for reproducibility
def seed_everything(seed=42):
np.random.seed(seed)
try: import torch; torch.manual_seed(seed)
except: pass
try: import tensorflow as tf; tf.random.set_seed(seed)
except: pass
DATA_PATH = "../data/digit-recognizer/train.csv"
print(f"Using data from: {os.path.abspath(DATA_PATH)}")
data = pd.read_csv(DATA_PATH)
SEED = 42
EPOCHS = 10
TRAIN_BATCH_SIZE = 128
# Logging CPU Info
import multiprocessing
print(f"CPU Cores available: {multiprocessing.cpu_count()}")
Using data from: /home/mitchellmirano/Desktop/MitchellProjects/sorix/docs/examples/data/digit-recognizer/train.csv CPU Cores available: 32
0. Data Preparation¶
We use the same data split for all frameworks to ensure a fair comparison.
In [3]:
Copied!
from sklearn.model_selection import train_test_split
# Scale pixel values from [0, 255] to [0, 1] as float32.
X = data.drop("label", axis=1).values.astype('float32') / 255.0
y = data["label"].values
# Fixed random_state so every framework trains/tests on the exact same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
from sklearn.model_selection import train_test_split
X = data.drop("label", axis=1).values.astype('float32') / 255.0
y = data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
Train size: 33600, Test size: 8400
1. Unified Device-Aware Benchmark¶
We evaluate each framework on both CPU and GPU (if available) to differentiate performance accurately.
In [4]:
Copied!
import torch
import tensorflow as tf
import cupy as cp
import time
import sorix
import numpy as np
from sorix import tensor
from sorix.nn import Module, Linear, CrossEntropyLoss, ReLU, BatchNorm1d, Dropout
from sorix.optim import RMSprop
from sorix.utils.data import Dataset, DataLoader
# Results storage — each run_* function appends one dict per (framework, device) run.
all_results = []
INFERENCE_BATCH_CPU = 4096 # large batch amortizes per-batch Python overhead on CPU
INFERENCE_BATCH_GPU = 1024
def run_sorix(device_name='cpu'):
# Benchmark Sorix on the given device ('cpu' or 'cuda'):
# trains the 3-layer MLP for EPOCHS, times full-test-set inference,
# and appends a result row to the global all_results list.
# NOTE: statement order relative to time.time() calls defines what is measured.
print(f"--- Running Sorix on {device_name} ---")
seed_everything(SEED)
# Same architecture as the PyTorch/TensorFlow models below:
# 784 -> 128 (no bias, BatchNorm) -> 64 -> 10, ReLU activations, 0.2 dropout.
class SorixModel(Module):
def __init__(self):
super().__init__()
self.linear1 = Linear(784, 128, bias=False)
self.bn1 = BatchNorm1d(128)
self.linear2 = Linear(128, 64)
self.linear3 = Linear(64, 10)
self.relu = ReLU()
self.dropout = Dropout(p=0.2)
def forward(self, x):
x = self.linear1(x); x = self.bn1(x); x = self.relu(x)
x = self.linear2(x); x = self.relu(x)
x = self.dropout(x); x = self.linear3(x)
return x
model = SorixModel().to(device_name)
loss_fn = CrossEntropyLoss()
# lr/alpha mirror the torch RMSprop(alpha=0.99) and tf RMSprop(rho=0.99) below.
optimizer = RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
train_ds = Dataset(X_train, y_train.reshape(-1, 1))
train_loader = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time — the timer includes batch iteration and host->device transfers.
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in train_loader:
xb, yb = xb.to(device_name), yb.to(device_name)
optimizer.zero_grad()
logits = model(xb)
loss = loss_fn(logits, yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time — eval mode disables dropout and uses BatchNorm running stats.
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = DataLoader(Dataset(X_test, y_test.reshape(-1, 1)), batch_size=batch_size)
# Warmup — run one forward pass before timing so one-off setup cost is excluded.
dummy = tensor(X_test[:100]).to(device_name)
with sorix.no_grad(): _ = model(dummy)
start_inf = time.time()
with sorix.no_grad():
for xb, _ in inf_loader:
xb = xb.to(device_name)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy check — computed on the first 1000 test samples only (matches the
# PyTorch/TensorFlow runs, so the comparison stays apples-to-apples).
tx = tensor(X_test[:1000]).to(device_name)
with sorix.no_grad():
out = model(tx)
preds = sorix.argmax(out, axis=1, keepdims=True)
acc = (preds.cpu().data.flatten() == y_test[:1000]).mean()
all_results.append({
'Framework': 'Sorix',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_pytorch(device_name='cpu'):
# Benchmark PyTorch on the given device ('cpu' or 'cuda'); same protocol and
# architecture as run_sorix, appends one result row to all_results.
print(f"--- Running PyTorch on {device_name} ---")
pt_device = torch.device(device_name)
seed_everything(SEED)
# Mirror of SorixModel: 784 -> 128 (no bias, BatchNorm) -> 64 -> 10.
class PyTorchModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Linear(784, 128, bias=False),
torch.nn.BatchNorm1d(128),
torch.nn.ReLU(),
torch.nn.Linear(128, 64),
torch.nn.ReLU(),
torch.nn.Dropout(0.2),
torch.nn.Linear(64, 10)
)
def forward(self, x): return self.net(x)
model = PyTorchModel().to(pt_device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
ds = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).long())
loader = torch.utils.data.DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time — includes batch iteration and host->device transfers.
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in loader:
xb, yb = xb.to(pt_device), yb.to(pt_device)
optimizer.zero_grad()
loss = loss_fn(model(xb), yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = torch.utils.data.DataLoader(
torch.utils.data.TensorDataset(torch.from_numpy(X_test)),
batch_size=batch_size
)
# Warmup — one untimed forward pass excludes one-off setup cost.
dummy = torch.from_numpy(X_test[:100]).to(pt_device)
with torch.no_grad(): _ = model(dummy)
start_inf = time.time()
with torch.no_grad():
# single-tensor TensorDataset yields 1-tuples, hence the "xb," unpacking
for xb, in inf_loader:
xb = xb.to(pt_device)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy — on the first 1000 test samples only, matching the other runs.
model.eval()
tx = torch.from_numpy(X_test[:1000]).to(pt_device)
with torch.no_grad():
acc = (model(tx).argmax(1).cpu().numpy() == y_test[:1000]).mean()
all_results.append({
'Framework': 'PyTorch',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_tensorflow(device_name='cpu'):
# Benchmark TensorFlow/Keras on the given device ('cpu' or 'gpu'); same
# protocol and architecture as run_sorix/run_pytorch, appends to all_results.
tf_dev = f"/{device_name.upper()}:0"
print(f"--- Running TensorFlow on {tf_dev} ---")
seed_everything(SEED)
with tf.device(tf_dev):
# Mirror of the other models: 784 -> 128 (no bias, BatchNorm) -> 64 -> 10.
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(128, input_shape=(784,), use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(64),
tf.keras.layers.ReLU(),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10)
])
# rho=0.99 matches alpha=0.99 in the Sorix/PyTorch RMSprop configs;
# from_logits=True because the last Dense layer has no softmax.
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.99),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# 1. Training Time — model.fit handles batching/shuffling internally.
start_train = time.time()
model.fit(X_train, y_train, batch_size=TRAIN_BATCH_SIZE, epochs=EPOCHS, verbose=0)
train_time = time.time() - start_train
# 2. Inference Time
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
# Warmup — one untimed predict excludes graph tracing/compilation cost.
_ = model.predict(X_test[:100], verbose=0)
start_inf = time.time()
_ = model.predict(X_test, batch_size=batch_size, verbose=0)
inf_time = time.time() - start_inf
# Accuracy — on the first 1000 test samples only, matching the other runs.
preds = model.predict(X_test[:1000], verbose=0).argmax(axis=1)
acc = (preds == y_test[:1000]).mean()
all_results.append({
'Framework': 'TensorFlow',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
# Run all — CPU passes first, then GPU passes if CUDA is visible to Sorix.
sorix_model = run_sorix('cpu')
pytorch_model = run_pytorch('cpu')
tensorflow_model = run_tensorflow('cpu')
# NOTE(review): the GPU passes are gated only on sorix.cuda.is_available();
# this assumes PyTorch and TensorFlow can also see the GPU — confirm, otherwise
# run_pytorch('cuda') / run_tensorflow('gpu') would fail on a sorix-only setup.
if sorix.cuda.is_available():
sorix_model_gpu = run_sorix('cuda')
pytorch_model_gpu = run_pytorch('cuda')
tensorflow_model_gpu = run_tensorflow('gpu')
df_results = pd.DataFrame(all_results)
display(df_results)
import torch
import tensorflow as tf
import cupy as cp
import time
import sorix
import numpy as np
from sorix import tensor
from sorix.nn import Module, Linear, CrossEntropyLoss, ReLU, BatchNorm1d, Dropout
from sorix.optim import RMSprop
from sorix.utils.data import Dataset, DataLoader
# Results storage
all_results = []
INFERENCE_BATCH_CPU = 4096 # Using large batch for CPU as requested
INFERENCE_BATCH_GPU = 1024
def run_sorix(device_name='cpu'):
print(f"--- Running Sorix on {device_name} ---")
seed_everything(SEED)
class SorixModel(Module):
def __init__(self):
super().__init__()
self.linear1 = Linear(784, 128, bias=False)
self.bn1 = BatchNorm1d(128)
self.linear2 = Linear(128, 64)
self.linear3 = Linear(64, 10)
self.relu = ReLU()
self.dropout = Dropout(p=0.2)
def forward(self, x):
x = self.linear1(x); x = self.bn1(x); x = self.relu(x)
x = self.linear2(x); x = self.relu(x)
x = self.dropout(x); x = self.linear3(x)
return x
model = SorixModel().to(device_name)
loss_fn = CrossEntropyLoss()
optimizer = RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
train_ds = Dataset(X_train, y_train.reshape(-1, 1))
train_loader = DataLoader(train_ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in train_loader:
xb, yb = xb.to(device_name), yb.to(device_name)
optimizer.zero_grad()
logits = model(xb)
loss = loss_fn(logits, yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = DataLoader(Dataset(X_test, y_test.reshape(-1, 1)), batch_size=batch_size)
# Warmup
dummy = tensor(X_test[:100]).to(device_name)
with sorix.no_grad(): _ = model(dummy)
start_inf = time.time()
with sorix.no_grad():
for xb, _ in inf_loader:
xb = xb.to(device_name)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy check
tx = tensor(X_test[:1000]).to(device_name)
with sorix.no_grad():
out = model(tx)
preds = sorix.argmax(out, axis=1, keepdims=True)
acc = (preds.cpu().data.flatten() == y_test[:1000]).mean()
all_results.append({
'Framework': 'Sorix',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_pytorch(device_name='cpu'):
print(f"--- Running PyTorch on {device_name} ---")
pt_device = torch.device(device_name)
seed_everything(SEED)
class PyTorchModel(torch.nn.Module):
def __init__(self):
super().__init__()
self.net = torch.nn.Sequential(
torch.nn.Linear(784, 128, bias=False),
torch.nn.BatchNorm1d(128),
torch.nn.ReLU(),
torch.nn.Linear(128, 64),
torch.nn.ReLU(),
torch.nn.Dropout(0.2),
torch.nn.Linear(64, 10)
)
def forward(self, x): return self.net(x)
model = PyTorchModel().to(pt_device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-3, alpha=0.99)
ds = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).long())
loader = torch.utils.data.DataLoader(ds, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# 1. Training Time
start_train = time.time()
for epoch in range(EPOCHS):
model.train()
for xb, yb in loader:
xb, yb = xb.to(pt_device), yb.to(pt_device)
optimizer.zero_grad()
loss = loss_fn(model(xb), yb)
loss.backward()
optimizer.step()
train_time = time.time() - start_train
# 2. Inference Time
model.eval()
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
inf_loader = torch.utils.data.DataLoader(
torch.utils.data.TensorDataset(torch.from_numpy(X_test)),
batch_size=batch_size
)
# Warmup
dummy = torch.from_numpy(X_test[:100]).to(pt_device)
with torch.no_grad(): _ = model(dummy)
start_inf = time.time()
with torch.no_grad():
for xb, in inf_loader:
xb = xb.to(pt_device)
_ = model(xb)
inf_time = time.time() - start_inf
# Accuracy
model.eval()
tx = torch.from_numpy(X_test[:1000]).to(pt_device)
with torch.no_grad():
acc = (model(tx).argmax(1).cpu().numpy() == y_test[:1000]).mean()
all_results.append({
'Framework': 'PyTorch',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
def run_tensorflow(device_name='cpu'):
tf_dev = f"/{device_name.upper()}:0"
print(f"--- Running TensorFlow on {tf_dev} ---")
seed_everything(SEED)
with tf.device(tf_dev):
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(128, input_shape=(784,), use_bias=False),
tf.keras.layers.BatchNormalization(),
tf.keras.layers.ReLU(),
tf.keras.layers.Dense(64),
tf.keras.layers.ReLU(),
tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10)
])
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3, rho=0.99),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
# 1. Training Time
start_train = time.time()
model.fit(X_train, y_train, batch_size=TRAIN_BATCH_SIZE, epochs=EPOCHS, verbose=0)
train_time = time.time() - start_train
# 2. Inference Time
batch_size = INFERENCE_BATCH_CPU if device_name == 'cpu' else INFERENCE_BATCH_GPU
# Warmup
_ = model.predict(X_test[:100], verbose=0)
start_inf = time.time()
_ = model.predict(X_test, batch_size=batch_size, verbose=0)
inf_time = time.time() - start_inf
preds = model.predict(X_test[:1000], verbose=0).argmax(axis=1)
acc = (preds == y_test[:1000]).mean()
all_results.append({
'Framework': 'TensorFlow',
'Device': 'GPU' if device_name.lower() in ['cuda', 'gpu'] else 'CPU',
'train_batch_size':TRAIN_BATCH_SIZE,
'Train Time': train_time,
'test_batch_size':batch_size,
'Inference Time': inf_time,
'Accuracy': acc
})
return model
# Run all
sorix_model = run_sorix('cpu')
pytorch_model = run_pytorch('cpu')
tensorflow_model = run_tensorflow('cpu')
if sorix.cuda.is_available():
sorix_model_gpu = run_sorix('cuda')
pytorch_model_gpu = run_pytorch('cuda')
tensorflow_model_gpu = run_tensorflow('gpu')
df_results = pd.DataFrame(all_results)
display(df_results)
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1774892960.012869 110092 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. I0000 00:00:1774892960.042410 110092 cpu_feature_guard.cc:227] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. WARNING: All log messages before absl::InitializeLog() is called are written to STDERR I0000 00:00:1774892960.736748 110092 port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
--- Running Sorix on cpu --- --- Running PyTorch on cpu --- --- Running TensorFlow on /CPU:0 ---
I0000 00:00:1774892978.452214 110092 gpu_device.cc:2043] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6074 MB memory: -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9 /home/mitchellmirano/Desktop/MitchellProjects/sorix/.venv/lib/python3.13/site-packages/keras/src/layers/core/dense.py:106: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs) I0000 00:00:1774892978.891980 110258 service.cc:153] XLA service 0x7f2af8016b30 initialized for platform Host (this does not guarantee that XLA will be used). Devices: I0000 00:00:1774892978.892028 110258 service.cc:161] StreamExecutor [0]: Host, Default Version (Driver: 0.0.0; Runtime: 0.0.0; Toolkit: 0.0.0; DNN: 0.0.0) I0000 00:00:1774892978.917816 110258 dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable. I0000 00:00:1774892979.198704 110258 device_compiler.h:208] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
✅ GPU basic operation passed ✅ GPU available: NVIDIA GeForce RTX 4070 Laptop GPU CUDA runtime version: 13000 CuPy version: 13.6.0 --- Running Sorix on cuda --- --- Running PyTorch on cuda --- --- Running TensorFlow on /GPU:0 ---
I0000 00:00:1774893007.567461 110261 service.cc:153] XLA service 0x7f2ae00301f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices: I0000 00:00:1774893007.567481 110261 service.cc:161] StreamExecutor [0]: NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9 (Driver: 13.1.0; Runtime: 12.8.0; Toolkit: 12.5.0; DNN: 9.10.2) I0000 00:00:1774893007.657842 110261 cuda_dnn.cc:461] Loaded cuDNN version 91002 I0000 00:00:1774893007.681040 110261 dot_merger.cc:481] Merging Dots in computation: a_inference_one_step_on_data_10677__.14 I0000 00:00:1774893007.708346 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893008.032738 111416 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_1_4', 32 bytes spill stores, 32 bytes spill loads I0000 00:00:1774893008.042438 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893008.589964 110261 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. 
I0000 00:00:1774893008.866308 111414 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_20', 12 bytes spill stores, 12 bytes spill loads I0000 00:00:1774893010.795445 110264 dot_merger.cc:481] Merging Dots in computation: a_inference_one_step_on_data_10677__.14 I0000 00:00:1774893010.820588 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.210513 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.711918 110264 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893011.955275 111518 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_20', 12 bytes spill stores, 12 bytes spill loads I0000 00:00:1774893016.229980 110254 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893016.557614 112306 subprocess_compilation.cc:348] ptxas warning : Registers are spilled to local memory in function 'gemm_fusion_MatMul_8', 16 bytes spill stores, 16 bytes spill loads I0000 00:00:1774893018.147145 110257 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. 
Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead. I0000 00:00:1774893018.959095 110258 dot_search_space.cc:240] All configs were filtered out because none of them sufficiently match the hints. Maybe the hints set does not contain a good representative set of valid configs? Working around this by using the full hints set instead.
| Framework | Device | train_batch_size | Train Time | test_batch_size | Inference Time | Accuracy | |
|---|---|---|---|---|---|---|---|
| 0 | Sorix | CPU | 128 | 7.362622 | 4096 | 0.031955 | 0.972 |
| 1 | PyTorch | CPU | 128 | 9.213377 | 4096 | 0.048254 | 0.974 |
| 2 | TensorFlow | CPU | 128 | 17.142942 | 4096 | 0.186286 | 0.968 |
| 3 | Sorix | GPU | 128 | 6.205791 | 1024 | 0.015433 | 0.976 |
| 4 | PyTorch | GPU | 128 | 4.037397 | 1024 | 0.025247 | 0.976 |
| 5 | TensorFlow | GPU | 128 | 8.978923 | 1024 | 1.702561 | 0.975 |
2. Model Export and Size Comparison¶
Size is measured using CPU-exported state dicts.
In [5]:
Copied!
import pickle
# We use the CPU models for size comparison to be fair
# Sorix: pickle the state dict (parameters only, no architecture).
sorix_path = "model_sorix.sor"
with open(sorix_path, 'wb') as f: pickle.dump(sorix_model.state_dict(), f)
sorix_size = os.path.getsize(sorix_path) / 1024
# PyTorch: likewise, state dict only.
pytorch_path = "model_pytorch.pt"
torch.save(pytorch_model.state_dict(), pytorch_path)
pytorch_size = os.path.getsize(pytorch_path) / 1024
# TensorFlow: model.save writes architecture + optimizer state as well,
# so this file is expected to be larger than the two state-dict-only exports.
tensorflow_path = "model_tf.keras"
tensorflow_model.save(tensorflow_path) # TF saves full model usually
tensorflow_size = os.path.getsize(tensorflow_path) / 1024
# Sizes reported in KB.
model_sizes = pd.DataFrame({
'Framework': ['Sorix', 'PyTorch', 'TensorFlow'],
'Size (KB)': [sorix_size, pytorch_size, tensorflow_size]
})
display(model_sizes)
import pickle
# We use the CPU models for size comparison to be fair
sorix_path = "model_sorix.sor"
with open(sorix_path, 'wb') as f: pickle.dump(sorix_model.state_dict(), f)
sorix_size = os.path.getsize(sorix_path) / 1024
pytorch_path = "model_pytorch.pt"
torch.save(pytorch_model.state_dict(), pytorch_path)
pytorch_size = os.path.getsize(pytorch_path) / 1024
tensorflow_path = "model_tf.keras"
tensorflow_model.save(tensorflow_path) # TF saves full model usually
tensorflow_size = os.path.getsize(tensorflow_path) / 1024
model_sizes = pd.DataFrame({
'Framework': ['Sorix', 'PyTorch', 'TensorFlow'],
'Size (KB)': [sorix_size, pytorch_size, tensorflow_size]
})
display(model_sizes)
| Framework | Size (KB) | |
|---|---|---|
| 0 | Sorix | 429.559570 |
| 1 | PyTorch | 432.911133 |
| 2 | TensorFlow | 890.750000 |