Tutorial 2: Gradient Descent - ADSC 4720 - Data Mining for Data Science

Imagine we want to build a simple AI that predicts a student’s test score (y) based on how many hours they studied (x).

We will assume a simple linear relationship passing through zero:

$$y = w \cdot x \tag{1}$$

import torch

# Set up: hours studied (X), test scores (Y), and one trainable weight w
# for the model y = w * x.
X = torch.tensor([1.0, 2.0, 3.0])
Y = torch.tensor([2.0, 4.0, 6.0])
w = torch.tensor(1.0, requires_grad=True)
learning_rate = 0.1
# The tensors defined above are the upper-case X and Y; printing the
# lower-case names would raise NameError.
print(X, Y, w)

tensor([1., 2., 3.]) tensor([2., 4., 6.]) tensor(1., requires_grad=True)

# Compute the prediction for every sample: element-wise w * X.
# The data tensor created in the set-up cell is the upper-case X; no
# lower-case x has been defined at this point.
y_pred = w * X
print(y_pred)

tensor(3.9840, grad_fn=<MulBackward0>)

# Compute loss: mean squared error against the targets Y (upper-case, as
# created in the set-up cell).
# .mean() reduces the per-sample squared errors to one scalar, which is what
# loss.backward() needs when called without arguments.
loss = ((y_pred - Y) ** 2).mean()
print(loss)

tensor(0.0003, grad_fn=<PowBackward0>)

# Clear any gradient left over from an earlier backward pass. w.grad is None
# until backward() has run at least once, so guard the in-place reset.
if w.grad is not None:
    w.grad.zero_()
# Backward pass: autograd stores d(loss)/dw in w.grad.
loss.backward()
w.grad.item()

-0.06400012969970703

# Gradient-descent step: w <- w - learning_rate * d(loss)/dw.
# The in-place update runs with gradient tracking disabled.
with torch.no_grad():
    w.sub_(learning_rate * w.grad)

print(w.item())

1.9919999837875366

# Recompute the prediction with the updated weight for every sample in X
# (the upper-case tensor from the set-up cell; lower-case x is undefined here).
y_pred = w * X
y_pred

tensor(3.6000, grad_fn=<MulBackward0>)

Put everything together

tensor([[1.0000, 1.2000],
        [2.0000, 1.2000]])

# Set up: two samples with two features each, and one target per sample.
X = torch.tensor([[1.0, 1.2], [2.0, 1.2]])
Y = torch.tensor([[2.0], [4.0]])
# One trainable weight per feature. The model has no bias term:
# y_pred = X @ W.
W = torch.tensor([[1.0], [1.0]], requires_grad=True)
learning_rate = 0.1

print(W)
for i in range(3):
    # forward pass: (2, 2) @ (2, 1) -> one prediction per sample
    y_pred = X @ W
    # loss: mean squared error over the samples
    loss = ((y_pred - Y) ** 2).mean()
    # backward pass: fills W.grad with d(loss)/dW
    loss.backward()
    # update weights with gradient tracking disabled
    with torch.no_grad():
        W -= learning_rate * W.grad
    print(f"Gradient: {W.grad}")
    # reset gradient so the next backward() does not accumulate onto it
    W.grad.zero_()

    print(f"parameter: w={W}")

    # y_pred was computed before this iteration's weight update
    print(f"Prediction: {y_pred}")

# Final prediction with the trained weights. It must use the same matrix
# product as the forward pass; an element-wise W * X would broadcast to a
# (2, 2) tensor that is not a prediction.
final_y = X @ W
print(final_y)

tensor([[1.],
        [1.]], requires_grad=True)
Gradient: tensor([[-1.4000],
        [-0.7200]])
parameter: w=tensor([[1.1400],
        [1.0720]], requires_grad=True)
Prediction: tensor([[2.2000],
        [3.2000]], grad_fn=<MmBackward0>)
Gradient: tensor([[-0.4408],
        [-0.0086]])
parameter: w=tensor([[1.1841],
        [1.0729]], requires_grad=True)
Prediction: tensor([[2.4264],
        [3.5664]], grad_fn=<MmBackward0>)
Gradient: tensor([[-0.2173],
        [ 0.1525]])
parameter: w=tensor([[1.2058],
        [1.0576]], requires_grad=True)
Prediction: tensor([[2.4715],
        [3.6556]], grad_fn=<MmBackward0>)
tensor([[1.2058, 1.4470],
        [2.1152, 1.2691]], grad_fn=<MulBackward0>)

tensor([[1.0000, 2.0000, 3.0000],
        [1.2000, 1.2000, 1.3000]])

tensor([[1.1667],
        [1.1820]], requires_grad=True)

import torch

# 1. Setup: one training pair (x, y) and a trainable scalar weight
learning_rate = 0.1
w = torch.tensor(1.0, requires_grad=True)
x = torch.tensor(2.0)
y = torch.tensor(4.0)

print(f"Initial Prediction: {w * x:.1f}")  # prints 2.0

# 2. Forward pass, then squared-error loss
y_pred = torch.mul(w, x)
loss = (y_pred - y).pow(2)

# 3. Backward pass: autograd stores d(loss)/dw in w.grad
loss.backward()
print(f"Gradient: {w.grad.item():.1f}")     # prints -8.0

# 4. Gradient-descent step, applied in place with gradient tracking disabled
with torch.no_grad():
    w.sub_(learning_rate * w.grad)

print(f"New Weight: {w.item():.1f}")        # prints 1.8
print(f"New Prediction: {w * x:.1f}")       # prints 3.6

Initial Prediction: 2.0
Gradient: -8.0
New Weight: 1.8
New Prediction: 3.6

import torch

# 1. Setup: one training pair and a trainable scalar weight
learning_rate = 0.1
w = torch.tensor(1.0, requires_grad=True)
x = torch.tensor(2.0)
y = torch.tensor(4.0)

# Table header for the per-iteration trace
print(f"{'Iter':<5} | {'w (Start)':<10} | {'Pred':<10} | {'Grad':<10} | {'w (End)':<10}")
print("-" * 55)

# Three gradient-descent iterations
for i in range(3):
    # A. Forward pass
    y_pred = torch.mul(w, x)
    # Remember the weight and prediction this iteration started from
    current_w, current_pred = w.item(), y_pred.item()

    # B. Squared-error loss
    loss = (y_pred - y).pow(2)

    # C. Backward pass: autograd writes d(loss)/dw into w.grad
    loss.backward()
    current_grad = w.grad.item()

    # D. Step the weight in place with gradient tracking disabled
    with torch.no_grad():
        w.sub_(learning_rate * w.grad)

    # E. Reset the gradient: backward() accumulates into w.grad, so without
    # this the second iteration would see (-8 + -1.6) = -9.6 instead of -1.6.
    w.grad.zero_()

    print(f"{i+1:<5} | {current_w:<10.3f} | {current_pred:<10.3f} | {current_grad:<10.3f} | {w.item():<10.3f}")

print(f"\nFinal Prediction: {w * x:.3f} (Target: 4.000)")

Iter  | w (Start)  | Pred       | Grad       | w (End)   
-------------------------------------------------------
1     | 1.000      | 2.000      | -8.000     | 1.800     
2     | 1.800      | 3.600      | -1.600     | 1.960     
3     | 1.960      | 3.920      | -0.320     | 1.992     

Final Prediction: 3.984 (Target: 4.000)

import torch

# 1. Dataset: three (hours, score) samples processed together as one batch
X = torch.tensor([1.0, 2.0, 3.0])
Y = torch.tensor([2.0, 4.0, 6.0])

# 2. Trainable weight and step size
learning_rate = 0.1
w = torch.tensor(1.0, requires_grad=True)

# Table header for the per-epoch trace
print(f"{'Epoch':<5} | {'w (Start)':<10} | {'Avg Grad':<10} | {'w (End)':<10} | {'Loss (MSE)':<10}")
print("-" * 65)

# 3. Training loop
for epoch in range(3):
    # Remember the weight this epoch starts from
    current_w = w.item()

    # A. Forward pass: predictions for all three samples at once
    y_pred = torch.mul(w, X)

    # B. Mean squared error: squared errors averaged over the 3 samples
    loss = (y_pred - Y).pow(2).mean()

    # C. Backward pass: gradient of the averaged loss lands in w.grad
    loss.backward()
    grad = w.grad.item()

    # D. In-place gradient-descent step with gradient tracking disabled
    with torch.no_grad():
        w.sub_(learning_rate * w.grad)

    # E. Clear the gradient before the next epoch's backward pass
    w.grad.zero_()

    print(f"{epoch+1:<5} | {current_w:<10.3f} | {grad:<10.3f} | {w.item():<10.3f} | {loss.item():<10.4f}")

print(f"\nFinal Weight: {w.item():.4f} (Target: 2.0)")

Epoch | w (Start)  | Avg Grad   | w (End)    | Loss (MSE)
-----------------------------------------------------------------
1     | 1.000      | -9.333     | 1.933      | 4.6667    
2     | 1.933      | -0.622     | 1.996      | 0.0207    
3     | 1.996      | -0.041     | 2.000      | 0.0001    

Final Weight: 1.9997 (Target: 2.0)