# gpt2_v1.ipynb

# ===================================================================
# Imports
# ===================================================================
import dataclasses
from dataclasses import dataclass
import logging
import os
from os.path import join, exists
import subprocess
import sys
import textwrap
from textwrap import wrap
import time
from typing import Optional

import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F


# ===================================================================
# Helper Timer Class
# ===================================================================
class ElapsedTimer:
    """
    Context manager and reusable timer to measure elapsed time.

    Example:
        timer = ElapsedTimer()
        with timer:
            do_something()
        print(f'Elapsed: {timer.elapsed:.3f}')

        # Re-enterable:
        with timer:
            do_something_else()
        print(f'Elapsed: {timer.elapsed:.3f}')
    """

    def __init__(self):
        # Both stay None until the timer has been entered/exited once.
        self.start = None
        self._elapsed = None

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Record the duration even if the body raised; returning None
        # lets any exception propagate.
        self._elapsed = time.perf_counter() - self.start

    @property
    def elapsed(self):
        """
        Return the elapsed time (seconds) for the most recent context.

        Raises:
            ValueError: If the timer has never completed a `with` block.
        """
        if self._elapsed is None:
            raise ValueError("Timer has not been used in a context yet.")
        return self._elapsed


# ===================================================================
# Globals
# ===================================================================
LOG_LEVEL = 'DEBUG'

# N.B. Raises KeyError if PARALLELOPEDIA_ROOT is not set.
PARALLELOPEDIA_ROOT = os.environ['PARALLELOPEDIA_ROOT']
PARALLELOPEDIA_DATA_DIR = join(PARALLELOPEDIA_ROOT, 'data')
MODEL_CHECKPOINT = join(
    PARALLELOPEDIA_DATA_DIR,
    'model_19072.pt',
)
MODEL_DOWNLOAD_URL = (
    "https://huggingface.co/datasets/trentnelson/"
    "parallelopedia-data-gpt2/resolve/main/model_19072.pt"
)

# Download the model from huggingface if necessary.
os.makedirs(PARALLELOPEDIA_DATA_DIR, exist_ok=True)
if not exists(MODEL_CHECKPOINT):
    print(
        f'Downloading {MODEL_DOWNLOAD_URL} via wget '
        'this might take a while...'
    )
    args = [
        "wget",
        "--quiet",
        MODEL_DOWNLOAD_URL,
        "-P", PARALLELOPEDIA_DATA_DIR,
    ]
    timer = ElapsedTimer()
    with timer:
        # check=True: a failed download raises CalledProcessError.
        subprocess.run(args, check=True)
    print(f'Downloaded model in {timer.elapsed:.3f} seconds.')

assert exists(MODEL_CHECKPOINT), "Missing checkpoint."

# ===================================================================
# Logging
# ===================================================================
# N.B. We redirect logs to sys.stdout in order for Quarto to pick
#      them up and include them in rendering the output.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL),
    format='%(asctime)s - %(levelname)s - %(message)s',
    stream=sys.stdout
)

# ===================================================================
# Setup
# ===================================================================

# Allow reduced-precision float32 matmuls where the hardware supports
# them.  N.B. Per the PyTorch docs, 'high' permits TensorFloat32 (or a
# sum-of-three-bfloat16 scheme); it is 'medium' that permits plain
# bfloat16.
torch.set_float32_matmul_precision('high')


# ===================================================================
# GPT2 PyTorch Model Components
# ===================================================================

# Now define the classes making up our GPT2 implementation.
# These map directly to the components introduced by the
# now-seminal 2017 "Attention Is All You Need" paper.

class CausalSelfAttention(nn.Module):
    """
    Causal self-attention for the GPT2 model.
    """

    def __init__(self, config):
        super().__init__()
        # The embedding must divide evenly across the heads.
        assert config.n_embd % config.n_head == 0

        # Key, query, value projections for all heads, but in a batch.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)

        # Output projection.  The NANOGPT_SCALE_INIT marker is read by
        # GPT._init_weights() to select a smaller init std.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

        # Cached for the split/reshape performed in forward().
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        """
        Apply multi-head causal self-attention.

        Args:
            x (torch.Tensor): Supplies the input of shape (B, T, C).

        Returns:
            torch.Tensor: The attended output, shape (B, T, C).
        """
        # Batch size, sequence length, embedding dimensionality.
        B, T, C = x.size()

        # Calculate query, key, values for all heads in
        # batch and move head forward to be the batch dim.
        #
        # N.B. nh is "number of heads", hs is "head size",
        #      and C (number of channels) is nh * hs.
        #      E.g. in GPT-2 (124M), n_head=12, hs=64, so
        #      nh*hs=C=768 channels in the Transformer.
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        head_dim = C // self.n_head

        # (B, nh, T, hs)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Fused attention (PyTorch may dispatch this to a flash
        # attention kernel); is_causal=True applies the causal mask.
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # Re-assemble all head outputs side by side.
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Output projection.
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    """
    Multi-layer perceptron for the GPT2 model.

    Expands to 4 * n_embd, applies tanh-approximated GELU, then
    projects back down to n_embd.
    """

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        # Marker read by GPT._init_weights() to select a smaller std.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    """
    Transformer block for the GPT2 model.

    Each sub-layer (attention, then MLP) is applied to a layer-normed
    copy of the input and added back onto the residual stream.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


# ===================================================================
# GPT2 Supporting Classes
# ===================================================================

# N.B. These differ slightly from Andrej's classes in
#      `train_gpt2.py`.  `GPTCheckpoint` is a helper
#      class I wrote that has no analog in the former.

@dataclass
class GPTConfig:
    """
    Configuration class for GPT model.

    Attributes:

        block_size (int): Maximum sequence length.

        vocab_size (int): Number of tokens.  GPT2 from
            huggingface has a vocab size of 50257, which
            includes 50,000 BPE merges, 256 byte tokens,
            and 1 <|endoftext|> token.  However, Andrej
            Karpathy's `build-nanogpt/train_gpt2.py`
            uses a vocab size of 50304.  I vaguely recall
            the explanation for this discrepancy as a local
            optimization to yield better alignment sizes,
            but I'm not 100% certain.

            The local GPT2 training that we did on
            edu_fineweb10b used 50304, so we will use
            that here.

        n_layer (int): Number of layers.

        n_head (int): Number of attention heads.

        n_embd (int): Embedding dimension.
    """
    block_size: int = 1024
    vocab_size: int = 50304
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768


# ===================================================================
# GPT2 Model Implementation
# ===================================================================

class GPT(nn.Module):
    """
    GPT2 language model: token + position embeddings, a stack of
    transformer blocks, a final layer norm, and a linear LM head whose
    weight is shared with the token embedding.
    """

    def __init__(self, config, device):
        """
        Args:

            config (GPTConfig): Supplies the model hyperparameters.

            device: Supplies the intended device.  N.B. This is only
                recorded on the instance; the constructor does not
                move any parameters.
        """
        super().__init__()
        self.config = config
        self.device = device
        # Default seed used by generate() when none is supplied.
        self.manual_seed = 42

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embd),
                wpe=nn.Embedding(config.block_size, config.n_embd),
                h=nn.ModuleList(
                    [Block(config) for _ in range(config.n_layer)]
                ),
                ln_f=nn.LayerNorm(config.n_embd),
            )
        )
        self.lm_head = nn.Linear(
            config.n_embd, config.vocab_size, bias=False
        )

        # Weight tying: the token embedding and LM head share one
        # parameter tensor.
        self.transformer.wte.weight = self.lm_head.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Initialize Linear/Embedding weights from N(0, std^2), zeroing
        any Linear bias.  std is 0.02, scaled by (2 * n_layer) ** -0.5
        for layers flagged with NANOGPT_SCALE_INIT.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, "NANOGPT_SCALE_INIT"):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Forward pass of the GPT model.

        Args:

            idx (torch.Tensor): Supplies the input tensor of shape
                (B, T).

            targets (torch.Tensor): Optionally supplies the target
                tensor of shape (B, T) for computing the loss.

        Returns:

            tuple: (logits, loss), where logits has shape
                (B, T, vocab_size) and loss is the cross-entropy
                against `targets`, or None if no targets were given.

        Raises:

            ValueError: If T exceeds config.block_size (there are no
                position embeddings beyond that length).
        """
        (B, T) = idx.size()
        if T > self.config.block_size:
            raise ValueError(
                f"Cannot forward sequence of length {T}; "
                f"block size is only {self.config.block_size}."
            )

        # Forward the token and position embeddings.

        # Shape (T)
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)

        # Position embeddings of shape (T, n_embd).
        pos_emb = self.transformer.wpe(pos)

        # Token embeddings of shape (B, T, n_embd).
        tok_emb = self.transformer.wte(idx)

        # pos_emb broadcasts across the batch dimension.
        x = tok_emb + pos_emb

        # Forward the blocks of the transformer.
        for block in self.transformer.h:
            x = block(x)

        # Forward the final layernorm and the classifier.
        x = self.transformer.ln_f(x)

        # (B, T, vocab_size)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            # Flatten to (B*T, vocab_size) vs (B*T,) for cross_entropy.
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1)
            )

        return (logits, loss)

    @classmethod
    def from_local_pretrained(
        cls, model_path: str, map_location: str = "cuda"
    ):
        """
        Load a model from a local checkpoint.

        N.B. This is a new method based off GPT.from_pretrained
             in Andrej Karpathy's train_gpt2.py.

        Args:

            cls (type): Supplies the class type.

            model_path (str): Supplies the path to the model
                checkpoint.

            map_location (str): Supplies the device onto which the
                checkpoint tensors are loaded; it is also recorded as
                the model's `device` attribute.  N.B. The module
                itself is not moved by this method; call `.to(...)`
                on the result if required.

        Returns:

            GPT: The model, in eval mode, with the checkpoint's
                weights loaded.
        """
        # Allow-list GPTConfig so checkpoints that pickled a config
        # instance can still be read by the restricted unpickler.
        with torch.serialization.safe_globals([GPTConfig]):
            checkpoint = torch.load(
                model_path,
                map_location=map_location,
            )

        # The config may have been saved either as a plain dict of
        # fields or as a GPTConfig instance; accept both.
        config = checkpoint["config"]
        if not isinstance(config, GPTConfig):
            config = GPTConfig(**config)

        model = cls(config, device=map_location)
        model.load_state_dict(checkpoint["model"])
        model.eval()

        msg = (
            f"Loaded model from step {checkpoint['step']}, "
            f"val_loss {checkpoint['val_loss']}"
        )
        logging.info(msg)
        return model

    def generate(
        self,
        text: str,
        max_length: int = 1024,
        top_k: int = 50,
        seed: Optional[int] = None,
    ) -> str:
        """
        Generate text from the model.

        N.B. This is a new method based off the generation code
             present in Andrej Karpathy's train_gpt2.py.

        Args:

            text (str): Supplies the prompt.

            max_length (int): Supplies the maximum total length,
                including prompt.  If this exceeds the model's block
                size, only the most recent block_size tokens are fed
                to the model at each step.

            top_k (int): Supplies the number of tokens to consider
                at each generation step.

            seed (int): Optionally supplies the manual seed to use
                for the generator.  If None, the model's manual
                seed will be used.

        Returns:

            str: The generated text (including the initial prompt).

        Raises:

            ValueError: If the prompt encodes to zero tokens.
        """
        self.eval()

        # Use the device the weights actually live on.  `self.device`
        # only records the constructor argument, so it is stale if
        # the model was moved (or never moved) after construction.
        device = next(self.parameters()).device

        # Obtain our GPT2 tokenizer, and resolve the stop token.
        enc = tiktoken.get_encoding("gpt2")
        stop_string = '<|endoftext|>'
        stop_token = enc.n_vocab - 1
        actual = enc.decode([stop_token])
        assert actual == stop_string, (
            f"expected {stop_string}, got {actual}"
        )

        # Encode the prompt.
        tokens = enc.encode(text)
        if not tokens:
            # There is no last time-step to sample from.
            raise ValueError("Prompt must encode to at least one token.")
        x = torch.tensor(
            tokens, dtype=torch.long, device=device
        ).unsqueeze(0)

        # Create a random generator for reproducibility.
        if seed is None:
            seed = self.manual_seed
        sample_rng = torch.Generator(device=device)
        sample_rng.manual_seed(seed)

        block_size = self.config.block_size

        # Generate tokens up to our max length, or until we hit the
        # stop token.
        start = time.perf_counter()
        count = 0
        with torch.no_grad():
            while x.size(1) < max_length:
                # N.B. Counts sampling steps, so a step that yields
                #      the stop token is included.
                count += 1

                # Never feed more than block_size tokens; this is a
                # no-op while the sequence still fits.
                context = x[:, -block_size:]

                # Forward pass, ignoring the returned loss.
                (logits, _) = self(context)

                # Take the logits at the last time-step (shape:
                # (1, vocab_size)).
                logits = logits[:, -1, :]

                # Convert to probabilities.
                probs = F.softmax(logits, dim=-1)

                # Top-k sampling.
                topk_probs, topk_indices = torch.topk(
                    probs, k=top_k, dim=-1
                )

                # Sample the next token.  multinomial returns an
                # index into the top-k list; gather maps it back to
                # a vocabulary id.
                next_idx = torch.multinomial(
                    topk_probs, num_samples=1, generator=sample_rng
                )
                next_token = torch.gather(topk_indices, -1, next_idx)

                # If the next token is the stop token, we're done.
                if next_token.item() == stop_token:
                    break

                # Otherwise, append the token to the current sequence
                # and continue generation.
                x = torch.cat((x, next_token), dim=1)

        end = time.perf_counter()
        elapsed = end - start
        # Guard against a zero-length interval (e.g. no steps ran).
        tokens_per_sec = float(count) / elapsed if elapsed > 0 else 0.0

        msg = (
            f'Generated {count} tokens in {elapsed:.2f} seconds '
            f'({tokens_per_sec:.2f} tokens/sec)'
        )
        logging.debug(msg)

        # Decode the output tokens and return the generated text,
        # including the initial prompt.
        output_tokens = x[0].tolist()
        return enc.decode(output_tokens)
In [2]:
# Load the checkpoint, then move the module onto the GPU.  The trailing
# expression is left bare so the notebook displays its value.
model = GPT.from_local_pretrained(MODEL_CHECKPOINT, map_location='cuda')
model.to('cuda')
2025-02-09 15:26:39,136 - INFO - Loaded model from step 19072, val_loss 3.0519702434539795
N.B. In order to achieve variable-width output for the generated text depending on the type of device on which the article is being viewed (see theme.scss), each generate() example is repeated three times, with a different width=N argument passed to the textwrap.fill() call each time. So if you’ve downloaded this notebook and are wondering why there seem to be three versions of certain things when the article only depicts one—that’s why.
In [3]:
# Fixed seed => reproducible sample.  Widest of the three wraps.
prompt = "Albert Einstein's Theory of Relativity stated that"
result = model.generate(prompt, seed=42)
print('\n' + textwrap.fill(result, width=105))
2025-02-09 15:26:40,464 - DEBUG - Generated 79 tokens in 0.81 seconds (98.10 tokens/sec)
Albert Einstein's Theory of Relativity stated that the speed of light was approximately 10 000 of
parsecs, whereas quantum physicists have suggested that, as we move further into the universe, the
universe might grow older. The new experiment, conducted by researchers at the University of New Jersey,
New York, and the University of California, Berkeley shows that photons travelling at the speed of light
will be around 30 to 65 kilometres per second.
In [4]:
# Same prompt and seed as the previous cell; medium-width wrap.
prompt = "Albert Einstein's Theory of Relativity stated that"
result = model.generate(prompt, seed=42)
print('\n' + textwrap.fill(result, width=58))
2025-02-09 15:26:41,014 - DEBUG - Generated 79 tokens in 0.54 seconds (145.49 tokens/sec)
Albert Einstein's Theory of Relativity stated that the
speed of light was approximately 10 000 of parsecs,
whereas quantum physicists have suggested that, as we move
further into the universe, the universe might grow older.
The new experiment, conducted by researchers at the
University of New Jersey, New York, and the University of
California, Berkeley shows that photons travelling at the
speed of light will be around 30 to 65 kilometres per
second.
In [5]:
# Same prompt and seed again; narrowest wrap.
prompt = "Albert Einstein's Theory of Relativity stated that"
result = model.generate(prompt, seed=42)
print('\n' + textwrap.fill(result, width=45))
2025-02-09 15:26:41,565 - DEBUG - Generated 79 tokens in 0.54 seconds (145.05 tokens/sec)
Albert Einstein's Theory of Relativity stated
that the speed of light was approximately 10
000 of parsecs, whereas quantum physicists
have suggested that, as we move further into
the universe, the universe might grow older.
The new experiment, conducted by researchers
at the University of New Jersey, New York,
and the University of California, Berkeley
shows that photons travelling at the speed of
light will be around 30 to 65 kilometres per
second.
In [6]:
# Re-use the prompt bound by an earlier cell with a different seed.
# Widest of the three wraps.
result = model.generate(prompt, seed=20190903)
print('\n' + textwrap.fill(result, width=100))
2025-02-09 15:26:57,518 - DEBUG - Generated 1015 tokens in 15.95 seconds (63.65 tokens/sec)
Albert Einstein's Theory of Relativity stated that the speed of light is the same as it is in two
places, which means that a given speed can either be described by two different speed equations
directly or they may be both equations. It is then assumed that the speed of light is the speed of
the universe or the universe's existence relative to Earth. In relativity, a measure of the speed of
light is the absolute speed of the light. As long as the speed of light is less than its speed in
two different places, the absolute speed can be calculated. For example, the absolute speed is
1/2990000000 (2,299,792,458) km/hr with an absolute speed about 10 times as fast as it is in two
different places. Now we can use the following equation to describe the speed of light: E = C/C2 The
speed of light, as a function of C, is a constant. By Einstein's definition of relativity, the speed
of light is a constant. This is because light travels at its maximum speed along the direction (if
it's travelling above the speed of light, the point where light must be observed is called
"aperture" of the speed of light). The speed of light is about half as fast as the speed of light
because the speed of light has a smaller varying velocity for each direction of radiation. The speed
of light, as a function of C, is a constant. The speed of a wave is the constant measured along the
direction of the wave relative to its location in space. E = C/C2 where E is the speed of light
along the direction of the wave. Because the speed of the wave is the speed of the particle in the
wave, and c the speed of the particle, E's is also given by the speed of light. For example, a light
particle is moving from its place of greatest velocity to its location of greatest velocity. E.g. C
= F/d, C = d/d For most materials and most other objects, the speed of light is the same for all
wavelengths. The speed of light is, on the other hand, the speed of the energy form of a photon.
E.g. c = C/d, C = e/d For most particles, light travels over one degree of separation and this is
how photons interact with other particles. We can compare a particle's velocity to an object's
velocity. The speed of light is measured by the distance between the particle's nose and the surface
of the object. For example, a photon of light emits the energy of a single photon. If a photon of
another type is fired at the same speed as the first, it will get out of the light, but a photon of
the other type will not get back to the ground. The fractional energy will be reduced. The distance
between two photons of the same type will be reduced to the square of their energies. E.g. C = C/C2,
C = -D/d., D = 9/6 A photon of color does not have sufficient energy to be emitted by that color and
is therefore subject to The speed of light is the change in velocity over time. This is a constant,
but sometimes it is possible to express it like this: E = c2/e In relativity, the length of the
distance is the length of time the length of wave is divided by the speed of light. E.g. a beam of
light travelling at about 9.2 miles per second must travel at around 7.3 miles per second to get
E.g. a beam moving at 3.2 miles per second must travel at around 8 miles per second to get E.g. a
beam moving at 1.8 miles per second must travel at 9.0 miles per second to get E.g. an object going
at 2.3 miles per second must travel at 1.8 miles per second to get E.g. a beam moving at 2.3 miles
per second must travel at 3.4 miles per second to get E.g.. a beam traveling at 3.4 miles per second
to get E.g.. a beam moving at 2.3 miles per second must travel at 3.8 miles per second to get E.g..
a beam traveling at 3.8 miles per second to get E.g.. a beam moving at about 4.4 miles per second
must travel at about 3.9 miles per second to get E.g.. a beam moving at 5.5 miles per second to get
a beam moving at 5.9 miles per S.G.D.. is the same thing as a mass. The distance is a unit in terms
of the speed of light. Determining the speed of light is an additional measure of the energy. For
most things
In [7]:
# Same prompt and seed as the previous cell; medium-width wrap.
result = model.generate(prompt, seed=20190903)
print('\n' + textwrap.fill(result, width=54))
2025-02-09 15:27:13,333 - DEBUG - Generated 1015 tokens in 15.81 seconds (64.21 tokens/sec)
Albert Einstein's Theory of Relativity stated that the
speed of light is the same as it is in two places,
which means that a given speed can either be described
by two different speed equations directly or they may
be both equations. It is then assumed that the speed
of light is the speed of the universe or the
universe's existence relative to Earth. In relativity,
a measure of the speed of light is the absolute speed
of the light. As long as the speed of light is less
than its speed in two different places, the absolute
speed can be calculated. For example, the absolute
speed is 1/2990000000 (2,299,792,458) km/hr with an
absolute speed about 10 times as fast as it is in two
different places. Now we can use the following
equation to describe the speed of light: E = C/C2 The
speed of light, as a function of C, is a constant. By
Einstein's definition of relativity, the speed of
light is a constant. This is because light travels at
its maximum speed along the direction (if it's
travelling above the speed of light, the point where
light must be observed is called "aperture" of the
speed of light). The speed of light is about half as
fast as the speed of light because the speed of light
has a smaller varying velocity for each direction of
radiation. The speed of light, as a function of C, is
a constant. The speed of a wave is the constant
measured along the direction of the wave relative to
its location in space. E = C/C2 where E is the speed
of light along the direction of the wave. Because the
speed of the wave is the speed of the particle in the
wave, and c the speed of the particle, E's is also
given by the speed of light. For example, a light
particle is moving from its place of greatest velocity
to its location of greatest velocity. E.g. C = F/d, C
= d/d For most materials and most other objects, the
speed of light is the same for all wavelengths. The
speed of light is, on the other hand, the speed of the
energy form of a photon. E.g. c = C/d, C = e/d For
most particles, light travels over one degree of
separation and this is how photons interact with other
particles. We can compare a particle's velocity to an
object's velocity. The speed of light is measured by
the distance between the particle's nose and the
surface of the object. For example, a photon of light
emits the energy of a single photon. If a photon of
another type is fired at the same speed as the first,
it will get out of the light, but a photon of the
other type will not get back to the ground. The
fractional energy will be reduced. The distance
between two photons of the same type will be reduced
to the square of their energies. E.g. C = C/C2, C =
-D/d., D = 9/6 A photon of color does not have
sufficient energy to be emitted by that color and is
therefore subject to The speed of light is the change
in velocity over time. This is a constant, but
sometimes it is possible to express it like this: E =
c2/e In relativity, the length of the distance is the
length of time the length of wave is divided by the
speed of light. E.g. a beam of light travelling at
about 9.2 miles per second must travel at around 7.3
miles per second to get E.g. a beam moving at 3.2
miles per second must travel at around 8 miles per
second to get E.g. a beam moving at 1.8 miles per
second must travel at 9.0 miles per second to get E.g.
an object going at 2.3 miles per second must travel at
1.8 miles per second to get E.g. a beam moving at 2.3
miles per second must travel at 3.4 miles per second
to get E.g.. a beam traveling at 3.4 miles per second
to get E.g.. a beam moving at 2.3 miles per second
must travel at 3.8 miles per second to get E.g.. a
beam traveling at 3.8 miles per second to get E.g.. a
beam moving at about 4.4 miles per second must travel
at about 3.9 miles per second to get E.g.. a beam
moving at 5.5 miles per second to get a beam moving at
5.9 miles per S.G.D.. is the same thing as a mass. The
distance is a unit in terms of the speed of light.
Determining the speed of light is an additional
measure of the energy. For most things
In [9]:
# Same prompt and seed again; narrowest wrap.
result = model.generate(prompt, seed=20190903)
print('\n' + textwrap.fill(result, width=40))
2025-02-09 15:27:56,654 - DEBUG - Generated 1015 tokens in 15.89 seconds (63.87 tokens/sec)
Albert Einstein's Theory of Relativity
stated that the speed of light is the
same as it is in two places, which means
that a given speed can either be
described by two different speed
equations directly or they may be both
equations. It is then assumed that the
speed of light is the speed of the
universe or the universe's existence
relative to Earth. In relativity, a
measure of the speed of light is the
absolute speed of the light. As long as
the speed of light is less than its
speed in two different places, the
absolute speed can be calculated. For
example, the absolute speed is
1/2990000000 (2,299,792,458) km/hr with
an absolute speed about 10 times as fast
as it is in two different places. Now we
can use the following equation to
describe the speed of light: E = C/C2
The speed of light, as a function of C,
is a constant. By Einstein's definition
of relativity, the speed of light is a
constant. This is because light travels
at its maximum speed along the direction
(if it's travelling above the speed of
light, the point where light must be
observed is called "aperture" of the
speed of light). The speed of light is
about half as fast as the speed of light
because the speed of light has a smaller
varying velocity for each direction of
radiation. The speed of light, as a
function of C, is a constant. The speed
of a wave is the constant measured along
the direction of the wave relative to
its location in space. E = C/C2 where E
is the speed of light along the
direction of the wave. Because the speed
of the wave is the speed of the particle
in the wave, and c the speed of the
particle, E's is also given by the speed
of light. For example, a light particle
is moving from its place of greatest
velocity to its location of greatest
velocity. E.g. C = F/d, C = d/d For most
materials and most other objects, the
speed of light is the same for all
wavelengths. The speed of light is, on
the other hand, the speed of the energy
form of a photon. E.g. c = C/d, C = e/d
For most particles, light travels over
one degree of separation and this is how
photons interact with other particles.
We can compare a particle's velocity to
an object's velocity. The speed of light
is measured by the distance between the
particle's nose and the surface of the
object. For example, a photon of light
emits the energy of a single photon. If
a photon of another type is fired at the
same speed as the first, it will get out
of the light, but a photon of the other
type will not get back to the ground.
The fractional energy will be reduced.
The distance between two photons of the
same type will be reduced to the square
of their energies. E.g. C = C/C2, C =
-D/d., D = 9/6 A photon of color does
not have sufficient energy to be emitted
by that color and is therefore subject
to The speed of light is the change in
velocity over time. This is a constant,
but sometimes it is possible to express
it like this: E = c2/e In relativity,
the length of the distance is the length
of time the length of wave is divided by
the speed of light. E.g. a beam of light
travelling at about 9.2 miles per second
must travel at around 7.3 miles per
second to get E.g. a beam moving at 3.2
miles per second must travel at around 8
miles per second to get E.g. a beam
moving at 1.8 miles per second must
travel at 9.0 miles per second to get
E.g. an object going at 2.3 miles per
second must travel at 1.8 miles per
second to get E.g. a beam moving at 2.3
miles per second must travel at 3.4
miles per second to get E.g.. a beam
traveling at 3.4 miles per second to get
E.g.. a beam moving at 2.3 miles per
second must travel at 3.8 miles per
second to get E.g.. a beam traveling at
3.8 miles per second to get E.g.. a beam
moving at about 4.4 miles per second
must travel at about 3.9 miles per
second to get E.g.. a beam moving at 5.5
miles per second to get a beam moving at
5.9 miles per S.G.D.. is the same thing
as a mass. The distance is a unit in
terms of the speed of light. Determining
the speed of light is an additional
measure of the energy. For most things