PyTorch is a Python-based scientific computing package serving two broad purposes: a replacement for NumPy that can run on GPUs and other accelerators, and an automatic differentiation library for building and training neural networks.
pip (preferred): pip install torch
conda (deprecated from v2.5 onwards): conda install -c pytorch pytorch
# assuming cuda-toolkit 12.4 is available
pip install torch --index-url https://download.pytorch.org/whl/cu124
# or if via conda
conda install -c pytorch -c nvidia pytorch pytorch-cuda=12.4
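The snippets below assume the following imports; draw is a notebook-local helper used to visualize tensors and is not part of any package shown here.
import numpy
import torch
import tqdm
import einops
import torchviz
from matplotlib import pyplot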
Creation: torch.tensor, torch.as_tensor; initializers: torch.rand, torch.ones, torch.arange, etc.
data = torch.tensor([ 1, 2, -9 ])
draw(data, torch.arange(5), torch.zeros((2, 3)))
torch.tensor vs torch.as_tensor¶
data = torch.as_tensor([ 1, 2, 3 ])
data
tensor([1, 2, 3])
np_data = numpy.eye(3)
pt_data = torch.tensor(np_data)
pt_data[2, 1] = -5
np_data
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
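For contrast, torch.as_tensor can share memory with the source NumPy array, so in-place edits are visible on both sides; a minimal sketch (variable names are illustrative):
shared_np = numpy.eye(3)
shared_pt = torch.as_tensor(shared_np)  # no copy: shares the NumPy buffer
shared_pt[2, 1] = -5
shared_np[2, 1]   # -5.0, unlike the torch.tensor copy above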
shape / size (shape in numpy)
pt_data.shape
torch.Size([3, 3])
dtype
pt_data.dtype
torch.float64
device: cpu, cuda:N (GPU), mps (Apple Metal), etc.
pt_data.device
device(type='cpu')
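dtype and device can also be set at creation time or changed later; a small sketch (the cuda line assumes a GPU is present):
x = torch.ones((2, 3), dtype=torch.float32)
x = x.to(torch.int64)                      # dtype cast
# x = torch.ones((2, 3), device="cuda:0")  # allocate directly on a GPU, if available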
a = torch.arange(4) * -1 # (4,)
b = torch.arange(3)[:, None] # (3, 1)
draw(a, b, a + b)
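The result follows the usual broadcasting rule: trailing dimensions are aligned and size-1 dimensions are stretched, so (3, 1) with (4,) yields (3, 4).
(a + b).shape   # torch.Size([3, 4])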
X = torch.rand((4, 4))    # (B, 4)
W = torch.rand((2, 4))    # (2, 4)
b = torch.rand(2) * -1    # (2,), broadcast to (B, 2)
h = X @ W.T + b # (B, 2)
draw(X, W.T, b, h)  
torch.stack, torch.cat
d1 = torch.arange(4)[:, None]
d2 = -1 * torch.arange(4)[:, None]
d3 = torch.cat([ d1, d2 ], dim=-1)
d3.shape
torch.Size([4, 2])
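For comparison, torch.stack joins along a new dimension instead of an existing one; a small sketch reproducing the same (4, 2) result from 1-D tensors:
s = torch.stack([ torch.arange(4), -1 * torch.arange(4) ], dim=-1)
s.shape   # torch.Size([4, 2])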
numpy() (numpy array), tolist() (python list)
type(pt_data.tolist())
list
Tensors are created on the cpu by default. Migration across devices: .<device>() (e.g. .cuda()), .to()
torch.cuda.is_available()
True
pt_data = torch.rand((2, 3))
pt_data = pt_data.to('cuda')
pt_data.device
device(type='cuda', index=0)
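Moving back is symmetric; a sketch (the mps line applies only on Apple-silicon machines):
pt_data = pt_data.cpu()        # back to host memory
# pt_data = pt_data.to("mps")  # Apple Metal backend, if available
pt_data.device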
torch.manual_seed(20240130)
pt_data1     = torch.rand(12, 4096, 768)
pt_data1_acc = pt_data1.to('cuda')
np_data1     = pt_data1.numpy()
pt_data2     = torch.rand(12, 768, 4096)
pt_data2_acc = pt_data2.to('cuda')
np_data2     = pt_data2.numpy()
%%timeit -r 4
np_data2 @ np_data1
39.9 ms ± 3.58 ms per loop (mean ± std. dev. of 4 runs, 10 loops each)
%%timeit -r 4
pt_data2 @ pt_data1
17.1 ms ± 142 μs per loop (mean ± std. dev. of 4 runs, 100 loops each)
%%timeit -r 4
pt_data2_acc @ pt_data1_acc
3.39 ms ± 410 μs per loop (mean ± std. dev. of 4 runs, 1,000 loops each)
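A caveat on the GPU number: CUDA kernels launch asynchronously, so honest wall-clock timings need an explicit synchronization; a hedged sketch of timing it manually:
import time
torch.cuda.synchronize()
start = time.perf_counter()
result = pt_data2_acc @ pt_data1_acc
torch.cuda.synchronize()       # wait for the kernel to actually finish
elapsed = time.perf_counter() - start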
where, view, reshape and contiguous
data = torch.randn((1, 6))
data = torch.where(data > 0, data, 0)
data2 = data.view((2, 3)).transpose(1, 0).reshape((6, 1))
data2[-1, 0] = -6
draw(data, data2)
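The contiguous part matters because view requires a contiguous layout, which transpose breaks; reshape (or an explicit contiguous()) handles the copy. A small sketch continuing with data from above:
t = data.view((2, 3)).transpose(1, 0)
t.is_contiguous()              # False after the transpose
# t.view((6, 1))               # would raise a RuntimeError here
t.contiguous().view((6, 1))    # works: contiguous() materializes a compact copy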
transpose and permute
pt_data = torch.rand((2, 3, 5))
new_pt_data = pt_data.permute((2, 0, 1))
new_pt_data.shape
torch.Size([5, 2, 3])
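transpose swaps exactly two dimensions, while permute reorders all of them; for instance:
pt_data.transpose(0, 2).shape   # torch.Size([5, 3, 2])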
gather and take
torch.manual_seed(20250201)
data = torch.randn((2, 3)) * 10
# pick columns per row: result[i, j] = data[i, index[i, j]]
index = torch.tensor([ [ 1, 2 ], [ 0, 2 ] ])
torch.gather(data, dim=1, index=index)
# sort each column by gathering along dim 0 with argsort indices
new_data = torch.gather(data, dim=0, index=data.argsort(dim=0))
draw(data, new_data)
a = torch.rand((5, 32, 32, 3))
a = a[-1, :, :, 1, None ]
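torch.take, by contrast, indexes into the flattened tensor; a minimal sketch:
flat_src = torch.tensor([ [ 10, 20, 30 ], [ 40, 50, 60 ] ])
torch.take(flat_src, torch.tensor([ 0, 4 ]))   # tensor([10, 50])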
torch.einsum / einops: Swiss army-knife for many tensor operations. Preserves semantics.
# diagonal: data.diag()
torch.einsum("ii->i", data)
# transpose: data.T
torch.einsum("ij -> ji", data)
# batch matrix multiplication: torch.bmm(batch_data1, batch_data2)
torch.einsum("bik, bkj -> bij", batch_data1, batch_data2)
# axis 0 sum: data.sum(dim=0)
torch.einsum("ij -> j", data)
# all sum: data.sum()
torch.einsum("ij -> ", data)
# reshape/view: x.view(x.shape[0], -1)
einops.rearrange(x, 'b c h w -> b (c h w)')
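A couple more einops patterns, under the same assumption that x is a 4-D (b, c, h, w) batch:
# mean over the spatial axes: x.mean(dim=(2, 3))
einops.reduce(x, 'b c h w -> b c', 'mean')
# tile along a new axis: x[:, None].expand(-1, 4, -1, -1, -1)
einops.repeat(x, 'b c h w -> b n c h w', n=4)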
einops package: https://einops.rocks
torch.autograd¶
torch.autograd prepares computation graphs on the fly for backward passes. Gradients are tracked only for tensors with requires_grad=True!
a = torch.tensor([ [1, 2.] ], requires_grad=True)
b = torch.tensor([ [3, 4.] ])
b.requires_grad = True
d = a ** 2
c = (a + b).T
e = d @ c
e = torch.where(e > 0, e, 0)
e
tensor([[28.]], grad_fn=<WhereBackward0>)
torchviz.make_dot(e)
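Calling backward() on the (single-element) result populates .grad on the leaf tensors; a short continuation of the example above:
e.sum().backward()   # reduce to a scalar and backpropagate through the graph
a.grad               # de/da, accumulated on the leaf
b.grad               # de/db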
detach() and no_grad()¶
a = torch.rand((2, 2), requires_grad=True)
b = torch.rand((2, 2), requires_grad=True)
c = a + b
d = a ** 2
e = c + d
torchviz.make_dot(e)
detach(): detaches a tensor from the computation graph.
c = a + b
d = (a ** 2).detach()
e = c + d
torchviz.make_dot(e)
no_grad(): prevents book-keeping for the computation graph. Useful for faster inference during evaluations.
with torch.no_grad():
    c = a + b
    d = a ** 2
    e = c + d
torchviz.make_dot(e)
inference_mode(): similar to no_grad(), but disables autograd tracking altogether. Recommended for model inference during evaluations.
with torch.inference_mode():
    c = a + b
    d = a ** 2
    e = c + d
e.requires_grad = True
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[48], line 5
      3 d = a ** 2
      4 e = c + d
----> 5 e.requires_grad = True
RuntimeError: Setting requires_grad=True on inference tensor outside InferenceMode is not allowed.
def mlp_init(x_dim, y_dim):
    prng = numpy.random.default_rng(seed=20240130)
    W1 = prng.uniform(-1, 1, size=(2, x_dim))
    W2 = prng.uniform(-1, 1, size=(y_dim, 2))
    b1 = numpy.zeros((2, 1))
    b2 = numpy.zeros((y_dim, 1))
    return W1, W2, b1, b2
def mlp_forward(x, W1, W2, b1, b2):
    if len(x.shape) < 2: x = x[:, numpy.newaxis].T
    h1 = (W1 @ x.T) + b1
    h1 = numpy.maximum(h1, 0)
    return h1, ((W2 @ h1) + b2).T
def mlp_backward(x, y, h, y_pred, W1, W2, b1, b2, lr=0.01):
    if len(x.shape) < 2: x = x[:, numpy.newaxis].T
    if len(y.shape) < 2: y = y[:, numpy.newaxis]
    if len(y_pred.shape) < 2: y_pred = y_pred[:, numpy.newaxis]
    # compute gradients as per calculations
    num_pts = x.shape[0]
    grad_y_pred = (y_pred - y).T / num_pts
    grad_h = W2.T @ grad_y_pred
    grad_h[h <= 0] = 0
    grad_W1 = grad_h @ x
    grad_W2 = grad_y_pred @ h.T
    grad_b1 = numpy.sum(grad_h, axis=1, keepdims=True) 
    grad_b2 = numpy.sum(grad_y_pred, axis=1, keepdims=True)
    # update weights
    W1 = W1 - lr * grad_W1
    W2 = W2 - lr * grad_W2
    b1 = b1 - lr * grad_b1
    b2 = b2 - lr * grad_b2
    return W1, W2, b1, b2
prng = numpy.random.default_rng(seed=20240130)
X = prng.random(size=(1000, 2))
Y = (X[:, 0] - X[:, 1]).reshape(-1, 1)
losses = []
W1, W2, b1, b2 = mlp_init(X.shape[-1], Y.shape[-1])
num_epochs, batch_size = 100, 10
num_batches = len(X) // batch_size
with tqdm.tqdm(total = num_epochs * num_batches) as pbar:
    for epoch in range(num_epochs):
        pbar.set_description(f"Epoch #{epoch+1}")
        for i in range(num_batches):
            start = i * batch_size
            x_batch = X[start:start+batch_size]
            y_batch = Y[start:start+batch_size]
            h, y_pred = mlp_forward(x_batch, W1, W2, b1, b2)
            loss = 0.5 * numpy.mean((y_pred - y_batch) ** 2)
            losses.append(float(loss.squeeze()))
            W1, W2, b1, b2 = mlp_backward(x_batch, y_batch, h, y_pred, W1, W2, b1, b2)
            pbar.update(1)
            pbar.set_postfix(dict(loss=loss))
pyplot.plot(losses)
mlp_forward(numpy.array([ 0.69, 0.42 ]), W1, W2, b1, b2)[-1].squeeze()
array(0.27104206)
def mlp_init_pt(x_dim, y_dim):
    torch.manual_seed(20240130)
    prng = numpy.random.default_rng(seed=20240130)
    W1 = prng.uniform(-1, 1, size=(2, x_dim))
    W2 = prng.uniform(-1, 1, size=(y_dim, 2))
    W1 = torch.tensor(W1, requires_grad=True)
    W2 = torch.tensor(W2, requires_grad=True)
    b1 = torch.zeros((2, 1), requires_grad=True)
    b2 = torch.zeros((y_dim, 1), requires_grad=True)
    return W1, W2, b1, b2
def mlp_forward_pt(x, W1, W2, b1, b2):
    if len(x.shape) < 2: x = x[:, None].T
    h1 = (W1 @ x.T) + b1
    h1 = torch.where(h1 > 0, h1, 0)
    return h1, ((W2 @ h1) + b2).T
def mlp_backward_pt(loss, W1, W2, b1, b2, lr=0.01):
    loss.backward()
    # update weights
    W1.data = W1 - lr * W1.grad
    W2.data = W2 - lr * W2.grad
    b1.data = b1 - lr * b1.grad
    b2.data = b2 - lr * b2.grad
    W1.grad = W2.grad = b1.grad = b2.grad = None
    return W1, W2, b1, b2
X, Y = torch.as_tensor(X), torch.as_tensor(Y)
losses = []
W1, W2, b1, b2 = mlp_init_pt(X.shape[-1], Y.shape[-1])
num_epochs, batch_size = 100, 10
num_batches = len(X) // batch_size
with tqdm.tqdm(total = num_epochs * num_batches) as pbar:
    for epoch in range(num_epochs):
        pbar.set_description(f"Epoch #{epoch+1}")
        for i in range(num_batches):
            start = i * batch_size
            x_batch = X[start:start+batch_size]
            y_batch = Y[start:start+batch_size]
            h, y_pred = mlp_forward_pt(x_batch, W1, W2, b1, b2)
            loss = 0.5 * torch.mean((y_pred - y_batch) ** 2)
            losses.append(loss.squeeze().item())
            W1, W2, b1, b2 = mlp_backward_pt(loss, W1, W2, b1, b2)
            pbar.update(1)
            pbar.set_postfix(dict(loss=loss.item()))
pyplot.plot(losses)
with torch.no_grad():
    output = mlp_forward_pt(torch.tensor([ 0.69, 0.42 ], dtype=torch.float64), W1, W2, b1, b2)[-1].squeeze()
output
torch.nn: Neural network utilities, activation functions, losses, etc.
torch.optim: Optimizers for deep learning, utilizing autograd.
torch.utils.data: Data processing utilities.
torch.nn¶
Layers: nn.Linear, nn.Embedding, nn.Conv2d, nn.LSTM, etc.
Regularization: nn.Dropout, nn.BatchNorm1d, etc.
Containers: nn.Module, nn.Sequential, nn.ModuleList, nn.ModuleDict
model = torch.nn.Sequential(
    torch.nn.Linear(X.shape[-1], 2),
    torch.nn.ReLU(),
    torch.nn.Linear(2, Y.shape[-1])
)
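A quick shape check of the container (the input width matches X from earlier; the batch of 5 random points is illustrative):
with torch.no_grad():
    out = model(torch.rand((5, X.shape[-1])))
out.shape   # torch.Size([5, 1]) for this regression setup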
Activations: nn.ReLU / nn.functional.relu, etc.
Losses: nn.CrossEntropyLoss / nn.functional.cross_entropy, etc.
Subclass nn.Module for fine-grained control:
class MLP(torch.nn.Module):
    def __init__(self, x_dim, y_dim):
        super().__init__()
        self.layer_1 = torch.nn.Linear(x_dim, 2)
        self.layer_1_act = torch.nn.ReLU()
        self.layer_2 = torch.nn.Linear(2, y_dim)
    def forward(self, inputs):
        hidden = self.layer_1_act(self.layer_1(inputs))
        return self.layer_2(hidden)
model = MLP(2, 1)
for name, param in model.named_parameters():
    print(name, param.requires_grad)
layer_1.weight True
layer_1.bias True
layer_2.weight True
layer_2.bias True
torch.utils.data¶
dataset = torch.utils.data.TensorDataset(torch.rand((40, 2)), torch.rand((40, 1)))
dataset[2] # x and y paired together, as intended.
(tensor([0.4854, 0.2087]), tensor([0.6260]))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True, drop_last=True)
for x_batch, y_batch in dataloader:
    print(x_batch.shape, y_batch.shape)
torch.Size([10, 2]) torch.Size([10, 1])
torch.Size([10, 2]) torch.Size([10, 1])
torch.Size([10, 2]) torch.Size([10, 1])
torch.Size([10, 2]) torch.Size([10, 1])
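Custom datasets follow the same map-style protocol (__len__ and __getitem__); a minimal hedged sketch mirroring TensorDataset (class and attribute names are illustrative):
class PairDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self, index):
        # one (x, y) pair, exactly what DataLoader collates into batches
        return self.inputs[index], self.targets[index]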
torch.optim¶
torch.optim.Adam, torch.optim.SGD, etc.
model = MLP(X.shape[-1], Y.shape[-1])
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
# Apply the update rule to each parameter (for SGD, roughly W.data = W.data - lr * W.grad)
optimizer.step()
# Clear accumulated gradients from the parameter tensors (W.grad = None)
optimizer.zero_grad()
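Putting it together, the canonical ordering inside one training step looks like this (the mini-batch below is a random placeholder):
x_batch, y_batch = torch.rand((10, 2)), torch.rand((10, 1))   # illustrative batch
y_pred = model(x_batch)
loss = torch.nn.functional.mse_loss(y_pred, y_batch)
loss.backward()        # accumulate gradients into each parameter's .grad
optimizer.step()       # apply the update rule
optimizer.zero_grad()  # reset gradients before the next mini-batch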
def mlp_init_true_pt(x_dim, y_dim):
    model = MLP(x_dim, y_dim)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    return model, optimizer
def mlp_forward_true_pt(X, model):
    return model(X)
def mlp_backward_true_pt(loss, model, optimizer):
    loss.backward()
    optimizer.step()
    model.zero_grad()
losses = []
model, optimizer = mlp_init_true_pt(X.shape[-1], Y.shape[-1])
num_epochs, batch_size = 100, 10
dataset    = torch.utils.data.TensorDataset(
    torch.as_tensor(X, dtype=torch.float32),
    torch.as_tensor(Y, dtype=torch.float32)
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
with tqdm.tqdm(total = num_epochs * len(dataloader)) as pbar:
    for epoch in range(num_epochs):
        pbar.set_description(f"Epoch #{epoch+1}")
        for x_batch, y_batch in dataloader:
            y_pred = mlp_forward_true_pt(x_batch, model)
            loss = torch.nn.functional.mse_loss(y_pred, y_batch)
            mlp_backward_true_pt(loss, model, optimizer)
            pbar.update(1)
            pbar.set_postfix(dict(loss=loss.item()))
            losses.append(loss.cpu().item())
pyplot.plot(losses)
with torch.no_grad():
    output = model(torch.tensor([[ 0.69, 0.42 ]]))
output
Some modules (nn.Dropout, nn.LayerNorm, etc.) behave differently depending on the mode: training or evaluation. Switch with train() / eval().
# model should be used only for training
model.train()
# model should be used for inference
model.eval() # or model.train(False)
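A small illustration of why the mode matters (dropout is active only in training mode):
drop = torch.nn.Dropout(p=0.5)
drop.train()
drop(torch.ones(8))   # roughly half the entries zeroed, the rest scaled by 1/(1-p)
drop.eval()
drop(torch.ones(8))   # identity: all ones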
torch.save (with or without class information), torch.load
# save and load with existing model object
torch.save(model.state_dict(), "model_dict.pt")
model.load_state_dict(torch.load("model_dict.pt"))
# save and load without object
torch.save(model, "model.pt")
model = torch.load("model.pt")
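When loading on a different device, map_location is useful; note also that recent PyTorch versions default torch.load to weights-only unpickling, so loading a full pickled module may need weights_only=False. A hedged sketch:
state = torch.load("model_dict.pt", map_location="cpu")
model.load_state_dict(state)
# model = torch.load("model.pt", weights_only=False)  # full-object load on newer versions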