Colab with all the code snippets from the book [The Art of Transformer Programming](https://yanivle.github.io//taotp.pdf).

# Chapter 1 - The Transformer

In [1]:
import numpy as np

def norm(x, axis=-1, epsilon: float = 1e-10):
  """Scale `x` so its entries sum to (approximately) one along `axis`.

  `epsilon` is added to the sum before dividing, so a slice that sums to
  exactly zero does not cause a division by zero.
  """
  total = x.sum(axis=axis, keepdims=True)
  denominator = total + epsilon
  return x / denominator

def softmax(x, axis=-1):
  """Softmax of `x` along `axis`.

  The per-slice maximum is subtracted before exponentiating, so the largest
  exponent is exp(0) = 1; the exponentials are then rescaled with `norm`.
  """
  peak = np.max(x, axis=axis, keepdims=True)
  exponentials = np.exp(x - peak)
  return norm(exponentials, axis=axis)

def layer_norm(x, gamma = 1., beta = 0., axis = -1, epsilon = 1e-10):
  """Layer normalization of `x` along `axis`.

  Subtracts the mean, divides by (standard deviation + `epsilon`), then
  applies the scale `gamma` and the shift `beta`.
  """
  centered = x - x.mean(axis=axis, keepdims=True)
  std = x.var(axis=axis, keepdims=True) ** 0.5
  return beta + gamma * centered / (std + epsilon)

def self_attn(x, Q, K, V, P, mask):
  """Multi-head self-attention.

  `x` is indexed (position, model dim). `Q`, `K`, `V` and `P` are all
  indexed (head, model dim, head dim). `mask` is indexed (query position,
  key position); key positions whose mask entry is zero are excluded.
  """
  queries = np.einsum('nj,hjk->nhk', x, Q)
  keys = np.einsum('nj,hjk->nhk', x, K)
  values = np.einsum('nj,hjk->nhk', x, V)
  scale = Q.shape[-1] ** 0.5
  scores = np.einsum('nhk,mhk->nmh', queries, keys) / scale
  masked_scores = np.where(mask[..., None], scores, float('-inf'))
  attn_weights = softmax(masked_scores, axis=1)  # Normalize over key positions.
  per_head = np.einsum('nmh,mhk->nhk', attn_weights, values)
  # Project every head back to the model dimension and sum over heads.
  return np.einsum('nhk,hdk->nd', per_head, P)

def transformer_layer(x, Q, K, V, P, M1, b1, M2, b2, ln1, ln2, mask):
  """One pre-norm transformer layer: attention, then a ReLU MLP, both residual.

  `ln1` and `ln2` are keyword-argument dicts forwarded to `layer_norm`.
  """
  attn_out = self_attn(layer_norm(x, **ln1), Q, K, V, P, mask)
  x = x + attn_out
  hidden = np.maximum(layer_norm(x, **ln2) @ M1 + b1, 0)
  return x + hidden @ M2 + b2

def transformer(tokens, tok_emb, pos_emb, out_emb, layers, lnf):
  """Run the transformer on a 1-D array of token ids and return the logits.

  The result is indexed (position, vocabulary entry): the dot product of each
  final, layer-normalized activation with each row of `out_emb`.
  """
  (seq_len,) = tokens.shape
  positions = np.arange(seq_len)
  # Lower-triangular mask: position i may only attend to positions <= i.
  causal_mask = np.tril(np.ones((seq_len, seq_len)))
  hidden = tok_emb[tokens] + pos_emb[positions]
  for layer_params in layers:
    hidden = transformer_layer(hidden, **layer_params, mask=causal_mask)
  return np.einsum('nd,vd->nv', layer_norm(hidden, **lnf), out_emb)

In [2]:
def decode(tokens, params, max_steps=1000, eos=None):
  """Greedy decoding: repeatedly append the argmax token of the last position.

  Stops after `max_steps` appended tokens, or right after appending `eos`.
  Returns the extended token array; the input array is not modified because
  `np.append` returns a copy.
  """
  for _step in range(max_steps):
    logits = transformer(tokens, **params)
    next_tok = np.argmax(logits[-1])
    tokens = np.append(tokens, next_tok)
    if next_tok == eos:
      break
  return tokens

In [3]:
def base_layer(n_heads, d_model, d_ff, d_head):
    """Return the parameters of an all-zero transformer layer.

    The four attention tensors have shape (n_heads, d_model, d_head), the MLP
    maps d_model -> d_ff -> d_model, and both layer norms use gamma=1, beta=0.
    """
    attn_shape = (n_heads, d_model, d_head)
    # A fresh array per tensor, so later in-place edits stay independent.
    layer = {name: np.zeros(attn_shape) for name in ('Q', 'K', 'V', 'P')}
    layer['M1'] = np.zeros((d_model, d_ff))
    layer['b1'] = np.zeros((d_ff,))
    layer['M2'] = np.zeros((d_ff, d_model))
    layer['b2'] = np.zeros((d_model,))
    for ln_name in ('ln1', 'ln2'):
        layer[ln_name] = {'gamma': 1., 'beta': 0.}
    return layer

def base_params(vocab_size, block_size, d_model):
    """Return all-zero embedding tables, an empty layer list and a final LN.

    `tok_emb` and `out_emb` have shape (vocab_size, d_model), `pos_emb` has
    shape (block_size, d_model), and `lnf` uses gamma=1, beta=0.
    """
    table_rows = (
        ('tok_emb', vocab_size),
        ('out_emb', vocab_size),
        ('pos_emb', block_size),
    )
    params = {name: np.zeros((rows, d_model)) for name, rows in table_rows}
    params['layers'] = []
    params['lnf'] = {'gamma': 1., 'beta': 0.}
    return params

In [4]:
from functools import partial

class HumanReadableDict(dict):
    """A dict whose repr prints every value with thousands separators."""

    def __repr__(self) -> str:
        entries = []
        for key, value in self.items():
            entries.append(f"{key}: {value:,}")
        return "{" + ", ".join(entries) + "}"

def count_params(params, count=lambda x: x.size):
    """Count parameters, split into embedding and non-embedding totals.

    `out_emb` is counted only when its values differ from `tok_emb`, i.e.
    when the embeddings are not tied. Within the layers only `np.ndarray`
    entries are counted, so dict-valued entries such as layer-norm settings
    are skipped. `count` maps an array to its contribution (by default its
    number of elements).
    """
    emb_names = ["tok_emb", "pos_emb"]
    tied = np.all(params["tok_emb"] == params["out_emb"])
    if not tied:
        emb_names.append("out_emb")
    emb_params = {name: count(params[name]) for name in emb_names}
    non_emb_params = sum(
        count(value)
        for layer in params["layers"]
        for value in layer.values()
        if isinstance(value, np.ndarray))
    summary = {"n_params": sum(emb_params.values()) + non_emb_params}
    summary.update(emb_params)
    summary["non_emb"] = non_emb_params
    return HumanReadableDict(summary)

def _count_nonzero_entries(x):
    """Number of entries of `x` that differ from zero."""
    return (x != 0).sum()


# Same report as `count_params`, but counting only the non-zero entries.
count_non0_params = partial(count_params, count=_count_nonzero_entries)

# Chapter 2 - Hello World

In [5]:
def simpleformer(seq, pos_emb, out_emb):
  """Position-only model: the logits never depend on the token values.

  Only `len(seq)` is used. Returns an array indexed (position, vocabulary
  entry) with the dot product of each position embedding and each row of
  `out_emb`.
  """
  positions = np.arange(len(seq))
  hidden = pos_emb[positions]
  return np.einsum('nd,vd->nv', hidden, out_emb)

def decode_simpleformer(tokens, pos_emb, out_emb, eos):
  """Greedily extend `tokens` with `simpleformer` until `eos` is appended.

  There is no step limit: the loop ends only once the appended token equals
  `eos`. Returns the extended token array, including the final `eos`.
  """
  finished = False
  while not finished:
    logits = simpleformer(tokens, pos_emb, out_emb)
    next_tok = np.argmax(logits[-1])
    tokens = np.append(tokens, next_tok)
    finished = next_tok == eos
  return tokens

In [6]:
# The text to reproduce, wrapped in begin/end-of-sequence markers.
message = ["<bos>", *"Hello World!", "<eos>"]

# Token ids follow the iteration order of the set of distinct symbols.
tokenizer = {symbol: idx for idx, symbol in enumerate(set(message))}
detokenizer = {idx: symbol for symbol, idx in tokenizer.items()}


def tokenize(s):
  """Map a sequence of symbols to an array of token ids."""
  return np.array([tokenizer[c] for c in s])


def detokenize(a):
  """Map a sequence of token ids back to the concatenated symbols."""
  return "".join([detokenizer[i] for i in a])


# Every token gets its own point on the unit circle as output embedding.
t = np.linspace(0, 6, len(tokenizer))  # 6 = 2 pi - epsilon.
out_emb = np.stack((np.cos(t), np.sin(t)), axis=1)
# The embedding of position i is the output embedding of the token that
# should follow it, so the argmax of the logits picks exactly that token.
pos_emb = out_emb[tokenize(message[1:])]
output = detokenize(
  decode_simpleformer(
    tokenize(["<bos>"]), pos_emb, out_emb, eos=tokenizer["<eos>"],
  )[1:-1]
)

assert output == "Hello World!"

In [7]:
def pos_enc_3d(t):
    """Map the angle(s) `t` to a 3-D vector with zero mean and unit variance.

    The three components always sum to zero and their squares sum to 3. For a
    scalar `t` the result has shape (3,); for an array `t` the leading axis
    has length 3.
    """
    u = np.cos(t) / (2 ** .5)
    w = np.sin(t) / (6 ** .5)
    components = [u + w, -u + w, -2 * w]
    return np.array(components) * (3 ** .5)

def pos_enc_3d_array(n):
    """Stack `pos_enc_3d` at the `n` angles 2*pi*t/n, t = 0..n-1, into (n, 3)."""
    rows = []
    for step in range(n):
        rows.append(pos_enc_3d(2 * np.pi * step / n))
    return np.stack(rows)

In [8]:
# The same "Hello World!" demo, now run through the full `transformer`
# with no layers at all.
vocab_size = len(tokenizer)
block_size = len(message) - 1
params = base_params(vocab_size, block_size, d_model=3)
params['tok_emb'] = pos_enc_3d_array(vocab_size)
params['out_emb'] = params['tok_emb']  # Tied: the very same array object.
# The embedding of each position is the embedding of the token that should
# follow it, scaled by 1e6 so that it dominates the token embedding.
params['pos_emb'] = params['out_emb'][tokenize(message)[1:]] * 1_000_000

output = detokenize(
    decode(tokenize(['<bos>']), params, eos=tokenizer['<eos>'])[1:-1])

assert output == 'Hello World!'

# Chapter 3 - Lookup Table

In [9]:
def pos_3d_enc_rotation_matrix(theta):
    """Return the 3x3 matrix R that solves T @ R = S.

    The rows of T are `pos_enc_3d` at the angles 0, 1, 2 and the rows of S
    are `pos_enc_3d` at the same angles shifted by `theta`, so right
    multiplication by R maps those three encodings to their shifted versions.
    """
    base_angles = (0, 1, 2)
    # NOTE(review): the components of every pos_enc_3d vector sum to zero, so
    # T is mathematically rank-deficient; np.linalg.solve appears to rely on
    # floating-point round-off here - confirm this is intended.
    T = np.array([pos_enc_3d(angle) for angle in base_angles])
    S = np.array([pos_enc_3d(angle + theta) for angle in base_angles])
    return np.linalg.solve(T, S)

In [11]:
def build_untied_transformer(ks, vs, vocab_size, block_size, prefix_len, tok_dims):
    """Build a one-layer lookup transformer with an untied output embedding.

    Token embeddings and the per-head hash matrices are drawn at random. The
    output embedding is then accumulated from the layer's actual output on
    every key in `ks`.

    Args:
        ks: iterable of token-id sequences; each one indexes `tok_emb` and is
            run with a `prefix_len` x `prefix_len` causal mask.
        vs: iterable of output token ids, paired with `ks` via `zip`.
        vocab_size: number of rows of the token and output embeddings.
        block_size: number of rows of the position embedding.
        prefix_len: number of attention heads.
        tok_dims: number of model dimensions that hold the token embedding.

    Returns:
        The parameter dict created by `base_params`, with one layer appended.
    """
    d_head = tok_dims + 3
    n_heads = prefix_len
    d_model = d_head  # We don't need the standard d_head * n_heads.

    params = base_params(vocab_size, block_size, d_model)

    # Positions occupy the last 3 model dims, tokens the first `tok_dims`.
    params['pos_emb'] = np.pad(pos_enc_3d_array(block_size), ((0, 0), (tok_dims, 0)))
    params['tok_emb'] = np.pad(
        layer_norm(np.random.normal(0, 1, (vocab_size, tok_dims))), ((0, 0), (0, 3)))
    # One random tok_dims x tok_dims matrix per head.
    hash_matrices = np.random.normal(0, 1e-1, (prefix_len, tok_dims, tok_dims))

    layer0 = base_layer(n_heads, d_model, 0, d_head)
    params['layers'].append(layer0)
    for head in range(n_heads):
        # Rotate the query's position encoding back by `head` position steps
        # (one step is 2*pi/block_size, the spacing of pos_enc_3d_array).
        theta = -head * 2 * np.pi / block_size
        layer0['Q'][head, -3:, -3:] = pos_3d_enc_rotation_matrix(theta) * 1e8
        layer0['K'][head, -3:, -3:] = np.eye(3)
        layer0['V'][head] = np.eye(d_model)
        # Only the token dims of the attended vector are written back,
        # after being mixed by this head's hash matrix.
        layer0['P'][head, :tok_dims, :tok_dims] = hash_matrices[head].T
    layer0['P'][0] -= np.eye(d_model)  # Clean up residual.

    causal_mask = np.tril(np.ones((prefix_len, prefix_len)))
    for k, v in zip(ks, vs):
        x = params['tok_emb'][k] + params['pos_emb'][np.arange(len(k))]
        # Add the layer output at the last position of key `k` to the output
        # embedding row of its value `v`.
        params['out_emb'][v] += transformer_layer(x, **layer0, mask=causal_mask)[-1]

    return params

In [23]:
def logsumexp(x, axis=-1):
  """Numerically stable log(sum(exp(x))) along `axis`, kept with size 1."""
  peak = x.max(axis=axis, keepdims=True)
  shifted_sum = np.sum(np.exp(x - peak), axis=axis, keepdims=True)
  return np.log(shifted_sum) + peak

def loss_and_grad(E, M, k, v, y):
  """Loss of the lookup model plus hand-written gradients for `M` and `E`.

  Forward pass: the embeddings of each key's tokens are mixed by `M`, the
  result is normalized, and the logits are its dot products with every row
  of `E`. The normalization divides by `d + 3` and adds `3 * mean ** 2` to
  the variance, i.e. it treats `h` as if 3 extra zero entries were appended
  (presumably the 3 position dimensions of the full model - confirm).

  Args:
    E: embedding table of shape (V, d).
    M: mixing tensor, contracted with the key embeddings as 'nld,ldo->no'.
    k: integer array of keys indexing `E`; `E[k]` is treated as (n, l, d).
    v: integer array holding the target token id of each of the n examples.
    y: targets with the shape of the logits, (n, V).

  Returns:
    (loss, (dM, dE)): the loss summed over the examples and the gradients
    with respect to `M` and `E`.
  """
  V, d = E.shape
  x = E[k]  # n, l, d
  h = np.einsum('nld,ldo->no', x, M)  # n, d
  mean = h.sum(axis=1)[:, None] / (d + 3)
  h0 = h - mean
  var = ((h0 ** 2).sum(axis=1)[:, None] + 3 * mean ** 2) / (d + 3)  # n, 1
  std = var ** 0.5 + 1e-10  # n, 1
  h01 = h0 / std  # n, d
  z = np.einsum('vd,nd->nv', E, h01)  # n, V
  s = softmax(z)  # n, V
  # Negative log-probability of the target token, summed over the examples.
  loss =  (logsumexp(z) - z)[np.arange(len(z)), v].sum()
  
  # Backward pass, retracing the forward steps in reverse order.
  dz = s - y  # n, V
  dh01 = np.einsum('nv,vd->nd', dz, E)  # n, d
  dstd = np.einsum('nd,nd->n', dh01, h0)[:, None] / (-std ** 2)  # n, 1
  # `std - 1e-10` undoes the epsilon added above, leaving sqrt(var).
  dvar = dstd / (2 * (std - 1e-10))  # n, 1
  dh0 = dh01 / std + 2 * dvar * h0 / (d + 3)  # n, d
  dmean = dvar * 6 / (d + 3) * mean - dh0.sum(axis=1)[:, None]
  dh = dh0 + dmean / (d + 3)
  dM = np.einsum('no,nld->ldo', dh, x)
  dx = np.einsum('no,ldo->nld', dh, M)
  # E is used twice: as the output embedding (first term) and as the
  # source of the key embeddings `x` (scattered back below).
  dE = np.einsum('nd,nv->vd', h01, dz)
  np.add.at(dE, k, dx)
  
  return loss, (dM, dE)

In [24]:
from tqdm import tqdm

def onehot(t, v):
  """One-hot encode the index (or array of indices) `t` over `v` classes."""
  return np.eye(v)[t]

def optimize(E, M, ks, vs, iters, lr=1e-3):
  """Run `iters` steps of plain gradient descent on `E` and `M`, in place.

  Before every step the rows of `E` are re-normalized with `layer_norm`. The
  progress bar shows the loss of the current step. Nothing is returned.
  """
  n_classes = E.shape[0]
  targets = onehot(vs, n_classes)
  with tqdm(range(iters)) as progress:
    for _ in progress:
      E[:] = layer_norm(E)
      loss, grads = loss_and_grad(E, M, ks, vs, targets)
      dM, dE = grads
      E -= lr * dE
      M -= lr * dM
      progress.set_description(f'{loss=}')

In [25]:
def build_lookup_transformer(ks, vs, vocab_size, block_size, tok_dims, optimization_iters):
  """Build a one-layer lookup transformer with tied, optimized embeddings.

  The token embedding and the per-position hash matrices start out random and
  are fitted to the (key, value) pairs with `optimize`. The fitted embedding,
  layer-normalized and padded, is used as both token and output embedding.

  Args:
    ks: 2-D integer array of keys, shape (n, prefix_len).
    vs: target token ids, passed to `optimize` together with `ks`.
    vocab_size: number of rows of the token embedding.
    block_size: number of rows of the position embedding.
    tok_dims: number of model dimensions that hold the token embedding.
    optimization_iters: number of gradient steps run by `optimize`.

  Returns:
    The parameter dict created by `base_params`, with one layer appended.
  """
  n, prefix_len = ks.shape
  d_head = tok_dims + 3
  n_heads = prefix_len
  d_model = d_head  # We don't need the standard d_head * n_heads.
 
  params = base_params(vocab_size, block_size, d_model)

  # Positions occupy the last 3 model dims, tokens the first `tok_dims`.
  params['pos_emb'] = np.pad(pos_enc_3d_array(block_size), ((0, 0), (tok_dims, 0)))

  tok_emb = np.random.normal(0, 1e-1, (vocab_size, tok_dims))
  hash_matrices = np.random.normal(0, 1e-1, (prefix_len, tok_dims, tok_dims))
  # `optimize` updates `tok_emb` and `hash_matrices` in place.
  optimize(tok_emb, hash_matrices, ks, vs, optimization_iters)
  params['tok_emb'] = params['out_emb'] = np.pad(layer_norm(tok_emb), ((0, 0), (0, 3)))

  layer0 = base_layer(n_heads, d_model, 0, d_head)
  params['layers'].append(layer0)
  for head in range(n_heads):
    # Rotate the query's position encoding back by `head` position steps
    # (one step is 2*pi/block_size, the spacing of pos_enc_3d_array).
    theta = -head * 2 * np.pi / block_size
    layer0['Q'][head, -3:, -3:] = pos_3d_enc_rotation_matrix(theta) * 1e8
    layer0['K'][head, -3:, -3:] = np.eye(3)
    layer0['V'][head] = np.eye(d_model)
    # Head `head` uses the hash matrix counted from the end: head 0 gets the
    # matrix of the last key position, head 1 the one before it, and so on.
    layer0['P'][head, :tok_dims, :tok_dims] = hash_matrices[-1 - head].T
  layer0['P'][0] -= np.eye(d_model)  # Clean up residual.

  return params

# Chapter 4 - Search

In [26]:
def padding_block(d):
  """Return a length-`d` vector with zero mean and unit variance.

  The first entry is sqrt(d - 1); the remaining d - 1 entries are all
  -1 / sqrt(d - 1).
  """
  assert d >= 2, 'Padding blocks need at least 2 dimensions.'
  root = (d - 1) ** .5
  return np.array([root] + [-1 / root] * (d - 1))

In [27]:
def build_search_transformer(vocab_size, block_size, prefix_len):
    """Build the two-layer search transformer.

    Model layout (d_model = 3 * prefix_len + 3): the first 3 dims hold the
    token encoding, the last 3 dims hold the position encoding, and the dims
    in between start as layer-norm padding. Layer 0 overwrites everything
    after the first 3 dims with the token encodings found 1..prefix_len
    positions back. Layer 1 compares the first `d_head` dims of the query
    with the dims from index 3 on of the keys and copies the first 3 dims of
    the attended position.

    Args:
        vocab_size: number of tokens.
        block_size: number of positions.
        prefix_len: how many preceding tokens are gathered; must be >= 2.

    Returns:
        The parameter dict created by `base_params`, with two layers appended.
    """
    d_head = 3 * prefix_len
    n_heads = prefix_len + 1
    d_model = d_head + 3

    params = base_params(vocab_size, block_size, d_model)
    params['pos_emb'] = np.pad(pos_enc_3d_array(block_size), ((0, 0), (d_model - 3, 0)))
    # Zero-mean, unit-variance filler for the dims between token and position.
    ln_padding = np.tile(padding_block(d_model - 6), (vocab_size, 1))
    tok_emb = pos_enc_3d_array(vocab_size)
    tok_emb = np.concatenate((tok_emb, ln_padding), axis=-1)
    params['tok_emb'] = params['out_emb'] = np.pad(tok_emb, ((0, 0), (0, 3)))

    # d_head is too large for this layer (except for cleanup head0 we could do with 3 dims).
    layer0 = base_layer(n_heads, d_model, 0, d_head)
    params['layers'].append(layer0)
    for head in range(n_heads):
        # Rotate the query's position encoding back by `head` position steps.
        theta = -head * 2 * np.pi / block_size
        layer0['Q'][head, -3:, -3:] = pos_3d_enc_rotation_matrix(theta) * 1e8
        layer0['K'][head, -3:, -3:] = np.eye(3)
        if head == 0:  # Head0 just cleans up everything except for our own token embedding.
            layer0['V'][head, 3:] = np.eye(d_head)
            layer0['P'][head, 3:] = -np.eye(d_head) # Clean up residual.
        else:
            # Copy the attended token's first 3 dims into model dims
            # 3 * head .. 3 * head + 2.
            layer0['V'][head, :3, :3] = np.eye(3)
            layer0['P'][head, head * 3: (head + 1) * 3, :3] = np.eye(3)

    # n_heads is too large for this layer (we could do with 1 head + cleanup).
    layer1 = base_layer(n_heads, d_model, 0, d_head)
    params['layers'].append(layer1)
    # Head 0: the query reads model dims [0, d_head), the key reads model
    # dims [3, d_model); the attended position's first 3 dims are copied.
    layer1['Q'][0, :-3] = np.eye(d_head) * 1e8
    layer1['K'][0, 3:] = np.eye(d_head)
    layer1['V'][0, :3, :3] = np.eye(3)
    layer1['P'][0, :3, :3] = np.eye(3)

    # Clean up heads (we could scale head0's P by e.g. 1e7 and drop these):
    # Heads 1 and 2 below exist only when n_heads >= 3.
    assert prefix_len >= 2
    layer1['Q'][1, :-3] = np.eye(d_head) * 1e8
    layer1['K'][1, :-3] = np.eye(d_head)
    layer1['V'][1, :-3] = np.eye(d_head)
    layer1['P'][1, :-3] = -np.eye(d_head)
    layer1['Q'][2, -3:, -3:] = np.eye(3) * 1e8
    layer1['K'][2, -3:, -3:] = np.eye(3)
    layer1['V'][2, -3:, -3:] = np.eye(3)
    layer1['P'][2, -3:, -3:] = -np.eye(3)

    return params

# Chapter 5 - Sort

In [28]:
def build_min_transformer(vocab_size, block_size):
    """Build a one-layer, one-head, 3-dimensional model with tied embeddings.

    Token i is encoded at the angle 2*pi*(i + 1) / (2*vocab_size + 2), so all
    tokens lie strictly between 0 and pi. Only head dimension 0 of Q and K is
    non-zero, so each attention score is the product of two scalars: the
    query's projection onto pos_enc_3d(pi/2) (scaled by 1e8) and the key's
    projection onto pos_enc_3d(0).
    """
    d_model = 3
    params = base_params(vocab_size, block_size, d_model=d_model)

    angle_table = pos_enc_3d_array(2 * vocab_size + 2)
    params['tok_emb'] = angle_table[1:1 + vocab_size]
    params['out_emb'] = params['tok_emb']  # Tied: the very same array object.

    attn = base_layer(n_heads=1, d_model=d_model, d_ff=0, d_head=d_model)
    attn['Q'][0, :, 0] = pos_enc_3d(np.pi / 2) * 1e8
    attn['K'][0, :, 0] = pos_enc_3d(0)
    attn['V'][0] = np.eye(d_model)
    # The large output scale lets the attended token dominate the residual.
    attn['P'][0] = np.eye(d_model) * 1e6
    params['layers'].append(attn)

    return params

In [29]:
def stretch(x, mx):
    """Rescale `x` linearly so that its largest entry becomes `mx`."""
    peak = x.max()
    return mx * x / peak

def powerpoints(n, mx=np.pi):
    """Return `n` increasing points from 0 to `mx` whose gaps halve each step.

    Before rescaling, the points are the partial sums 0, 1, 1 + 1/2,
    1 + 1/2 + 1/4, ...
    """
    gaps = 1 / 2 ** np.arange(n - 1)
    partial_sums = np.cumsum(np.concatenate((np.zeros(1), gaps)))
    return stretch(partial_sums, mx)

def nextpoint(ts):
    """Return, for every point of `ts`, the angle used for its query.

    Entry j (for j < len(ts) - 2) lies a quarter of the way from ts[j + 1]
    towards ts[j + 2]. The last two entries are pi and 0.
    """
    interior = ts[1:-1] * 3 / 4 + ts[2:] / 4
    tail = np.array([np.pi, 0])
    return np.concatenate((interior, tail))

In [30]:
def build_sort_transformer(n=28, block_size=100):
    """Build the one-layer sorting transformer over a vocabulary of `n` tokens.

    Every token embedding is the concatenation of two 3-D encodings: `pi`,
    the encoding of the token's own angle (from `powerpoints`), and `qi`, the
    encoding of the angle returned by `nextpoint`. The single attention head
    uses the `qi` half (model dims 3-5) of the current token as the query and
    the `pi` half (model dims 0-2) of every visible token as the key.

    Args:
        n: vocabulary size.
        block_size: number of positions (the position embedding stays zero).

    Returns:
        The parameter dict created by `base_params`, with one layer appended.
    """
    angles = powerpoints(n)
    pi = pos_enc_3d(angles).T
    qi = pos_enc_3d(nextpoint(angles)).T

    params = base_params(vocab_size=n, block_size=block_size, d_model=6)
    params['tok_emb'] = params['out_emb'] = np.concatenate((pi, qi), axis=1)

    layer0 = base_layer(n_heads=1, d_model=6, d_ff=0, d_head=6)
    # The attention tensors are indexed (head, model dim, head dim), so the
    # single head has to be addressed explicitly. Slicing the head axis
    # instead (`Q[3:, :3]`, `K[:3, :3]`) selects blocks of shape (0, 3, 6) and
    # (1, 3, 6), which a (3, 3) matrix cannot be broadcast into.
    layer0['Q'][0, 3:, :3] = np.eye(3) * 1e20
    layer0['K'][0, :3, :3] = np.eye(3)
    layer0['V'][0] = np.eye(6) * 1e20  # Instead of cleanup.
    layer0['P'][0] = np.eye(6)
    params['layers'].append(layer0)

    return params

# Chapter 6 - Decimal Addition

In [31]:
def build_MLP(x0s, a0s):
    """Build a one-input ReLU MLP that outputs `a0s[j]` at the input `x0s[j]`.

    Every point gets three hidden units. Their weighted sum (weights -2, 1, 1)
    is a triangular bump that peaks at the point with height 1 and vanishes
    once the input is at least the smallest gap between points away.

    Returns (M1, b1, M2, b2) with shapes (1, 3n), (3n,), (3n, D) and (D,),
    where n = len(x0s) and D = len(a0s[0]).
    """
    ordered = np.sort(x0s)
    min_sep = np.min(ordered[1:] - ordered[:-1])
    n_points = len(x0s)
    out_dim = len(a0s[0])

    M1 = np.array([[1, 1, 1] * n_points])
    unit_shifts = np.array([0, 1, -1])
    b1 = np.concatenate([unit_shifts - x0 / min_sep for x0 in x0s])

    unit_weights = np.array([-2, 1, 1])
    columns = []
    for i in range(out_dim):
        columns.append(np.concatenate([unit_weights * a0[i] for a0 in a0s]))
    M2 = np.stack(columns, axis=-1)
    b2 = np.array([0] * out_dim)
    return M1 / min_sep, b1, M2, b2

In [32]:
def build_single_digit_mod_transformer():
    """Build a one-layer model over the digits 0-9 with a block size of 2.

    Digit i is embedded as [x, 1, a, -x - a - 1] with x = i / 100, where `a`
    is chosen so that the row has zero mean and unit variance. Q and K stay
    zero, so the attention weights are uniform over the visible positions.
    The MLP maps the i-th of 19 inputs (i = 0..18) to the embedding of the
    digit i % 10, scaled by 1e10.
    """
    def digit_embedding(i):
        x = i / 100
        a = (-x - 1 + (-3 * x ** 2 - 2 * x + 5) ** 0.5) / 2
        return np.array([x, 1, a, -x - a - 1])

    n_sums = 19
    tok_emb = np.array([digit_embedding(i) for i in range(10)])

    params = base_params(vocab_size=10, block_size=2, d_model=4)
    params['tok_emb'] = params['out_emb'] = tok_emb

    layer0 = base_layer(n_heads=1, d_model=4, d_ff=3 * n_sums, d_head=4)
    layer0['V'][0, :2, :2] = np.eye(2) * 1e4  # * 1e4 for messy cleanup.
    layer0['V'][0, 0, 0] *= 2 * 100  # * 2 for averaging, * 100 for tok_emb factor.
    layer0['P'][0, :2, :2] = np.eye(2)

    x0s, a0s = [], []
    for total in range(n_sums):
        # The MLP input is the first component of the layer-normalized state.
        x0s.append(layer_norm(np.array([total * 1e4, 1e4, 0., 0.]))[0])
        a0s.append(tok_emb[total % 10] * 1e10)
    mlp = build_MLP(x0s, a0s)
    layer0['M1'][0], layer0['b1'], layer0['M2'], layer0['b2'] = mlp

    params['layers'].append(layer0)

    return params

In [33]:
from itertools import product

def build_multi_digit_mod_transformer(n_digits):
    """Build a one-layer model for two `n_digits`-digit operands.

    The vocabulary holds the 10 digits plus '+' and '='. Model dims 0-3 hold
    the token embedding and dims 4-6 the position encoding. Each of the
    2 * n_digits value heads attends to a position at a fixed offset behind
    the current one and adds 10 ** digit times the digit it finds there to
    model dim 0. The MLP maps the i-th of `out_range` inputs to the embedding
    of the digit i % 10, scaled by 1e10.

    Args:
        n_digits: number of digits in each operand.

    Returns:
        The parameter dict created by `base_params`, with one layer appended.
    """
    vocab_size = 10 + 2  # 10 digits, '+' (10) and '=' (11).
    block_size = 2 * n_digits + 2  # n_digits for each number +2 for '+' and '='.
    d_model = 7  # 4 for token, 3 for position.
    params = base_params(vocab_size, block_size, d_model)

    tok_emb = np.empty((vocab_size, 4))
    for i in range(10):
        # `a` is chosen so that the row has zero mean and unit variance.
        x = i / 100
        a = (-x - 1 + (-3 * x ** 2 - 2 * x + 5) ** 0.5) / 2
        tok_emb[i] = np.array([x, 1, a, -x - a - 1])
    tok_emb[10] = tok_emb[11] = tok_emb[0]  # Anything normalized.
    tok_emb = np.pad(tok_emb, ((0, 0), (0, 3)))
    params['tok_emb'][:] = params['out_emb'][:] = tok_emb
    params['pos_emb'][:] = np.pad(pos_enc_3d_array(2 * n_digits + 2), ((0, 0), (4, 0)))

    # Two n-digit numbers sum to a value in 0 .. 2 * (10 ** n_digits - 1).
    out_range = 2 * (10 ** n_digits) - 1
    n_heads = 2 * n_digits + 1  # 1 cleanup head.
    layer0 = base_layer(n_heads, d_model, d_ff=3 * out_range, d_head=3)

    for head, (unit_pos, digit) in enumerate(product([1, n_digits + 2], range(n_digits))):
        # Attend to the position `unit_pos + digit` steps behind the query.
        theta = -(unit_pos + digit) * 2 * np.pi / block_size
        layer0['Q'][head, -3:, -3:] = pos_3d_enc_rotation_matrix(theta) * 1e6
        layer0['K'][head, -3:, -3:] = np.eye(3)
        # Token dim 0 stores digit / 100, so * 100 recovers the digit.
        layer0['V'][head, 0, 0] = 100 * (10 ** digit)
        layer0['P'][head, 0, 0] = 1

    # Cleanup head:
    # It attends to the query's own position (rotation by 0) and subtracts
    # model dims 2 and 3 from the residual.
    layer0['Q'][-1, -3:, -3:] = (pos_3d_enc_rotation_matrix(0) * 1e6)
    layer0['K'][-1, -3:, -3:] = np.eye(3)
    layer0['V'][-1, 2:4, :2] = -np.eye(2)
    layer0['P'][-1, 2:4, :2] = np.eye(2)

    # Pre-normalization state for the value i: token dims [i, 1, 0, 0]
    # followed by the encoding of position 0.
    x0 = lambda i: np.concatenate((np.array([i, 1., 0, 0]), params['pos_emb'][0, -3:]))
    x0s = [layer_norm(x0(i))[0] for i in range(out_range)]
    a0s = [tok_emb[i % 10] * 1e10 for i in range(out_range)]
    layer0['M1'][0], layer0['b1'], layer0['M2'], layer0['b2'] = build_MLP(x0s, a0s)

    params['layers'].append(layer0)

    return params

In [1]:
def build_decimal_addition_transformer(n_digits):
    """Build a one-layer model that writes out the digits of a sum.

    The block holds both operands, '+' and '=', and `n_digits` further
    positions for the result. Each of the 2 * n_digits value heads reads one
    fixed operand position and adds that digit, weighted by its power of ten
    and divided by 100, to model dim 0. The MLP input also contains the first
    component of the current position's encoding, so the MLP can output a
    different digit of the sum at each of the last n_digits + 1 positions.

    Args:
        n_digits: number of digits in each operand.

    Returns:
        The parameter dict created by `base_params`, with one layer appended.
    """
    vocab_size = 10 + 2  # 10 digits, '+' (10) and '=' (11).
    block_size = 2 * n_digits + 2 + n_digits  # An extra n_digits for the result.
    d_model = 7  # 4 for token, 3 for position.
    params = base_params(vocab_size, block_size, d_model)

    tok_emb = np.empty((vocab_size, 4))
    for i in range(10):
        # `a` is chosen so that the row has zero mean and unit variance.
        x = i / 100
        a = (-x - 1 + (-3 * x ** 2 - 2 * x + 5) ** 0.5) / 2
        tok_emb[i] = np.array([x, 1, a, -x - a - 1])
    tok_emb[10] = tok_emb[11] = tok_emb[0]  # Anything normalized.
    tok_emb = np.pad(tok_emb, ((0, 0), (0, 3)))
    params['tok_emb'][:] = params['out_emb'][:] = tok_emb
    params['pos_emb'][:] = np.pad(pos_enc_3d_array(3 * n_digits + 2), ((0, 0), (4, 0)))

    # Two n-digit numbers sum to a value in 0 .. 2 * (10 ** n_digits - 1).
    out_range = 2 * (10 ** n_digits) - 1
    n_heads = 2 * n_digits + 1  # 1 cleanup head.
    layer0 = base_layer(n_heads, d_model, d_ff=(n_digits + 1) * 3 * out_range, d_head=3)

    for head, (num_pos, digit) in enumerate(product([0, n_digits + 1], range(n_digits))):
        # The query is driven by model dim 1 only (the constant 1 of every
        # token embedding), so it targets the same absolute position
        # `num_pos + n_digits - 1 - digit` whatever the query position is.
        layer0['Q'][head, 1] = params['pos_emb'][num_pos + n_digits - 1 - digit, -3:] * 1e6
        layer0['K'][head, -3:, -3:] = np.eye(3)
        layer0['V'][head, 0, 0] = (10 ** digit) * 100
        layer0['P'][head, 0, 0] = 1 / 100

    # Cleanup and mix head:
    # It attends to the query's own position (rotation by 0) and subtracts
    # model dims 2 and 3 from the residual.
    layer0['Q'][-1, -3:, -3:] = (pos_3d_enc_rotation_matrix(0) * 1e6)
    layer0['K'][-1, -3:, -3:] = np.eye(3)
    layer0['V'][-1, 2:4, :2] = -np.eye(2)
    layer0['P'][-1, 2:4, :2] = np.eye(2)
    # Mix: in model dim 0, remove the own token's value and add the first
    # component of the own position encoding (model dim -3).
    layer0['V'][-1, 0, -1] = -1
    layer0['V'][-1, -3, -1] = 1
    layer0['P'][-1, 0, -1] = 1

    # Pre-normalization state for the sum i at a position with encoding p.
    x0 = lambda i, p: np.concatenate((np.array([i / 100 + p[-3], 1., 0, 0]), p))
    x0s, a0s = [], []
    for i in range(out_range):
        # One MLP point per (sum, output position) pair, over the last
        # n_digits + 1 positions ...
        for p in params['pos_emb'][-n_digits-1:, -3:]:
            x0s.append(layer_norm(x0(i, p))[0])
        # ... paired in order with the digits of i, most significant first.
        for digit in range(n_digits, -1, -1):
            a0s.append(tok_emb[i // (10 ** digit) % 10] * 1e10)

    layer0['M1'][0], layer0['b1'], layer0['M2'], layer0['b2'] = build_MLP(x0s, a0s)

    params['layers'].append(layer0)

    return params

# Chapter 7 - The OG Encodings

In [35]:
def og_embedding(block_size, d):
    """Sinusoidal position table of shape (block_size, d).

    For i = 0 .. d // 2 - 1, column 2i holds sin(pos / 10000 ** (2i / d)) and
    column 2i + 1 holds the matching cosine.
    """
    positions = np.arange(block_size)[:, None]
    wavelengths = (10_000 ** (2 * np.arange(d // 2) / d))[None, :]
    phase = positions / wavelengths
    table = np.empty((block_size, d))
    table[:, 0::2] = np.sin(phase)
    table[:, 1::2] = np.cos(phase)
    return table

In [36]:
def rotation_encoding(q, thetas):
  """Encode the positions 0..q-1 as interleaved (cos, sin) pairs.

  Row p is [cos(p * thetas[0]), sin(p * thetas[0]), cos(p * thetas[1]), ...],
  so the result has shape (q, 2 * len(thetas)).

  Args:
    q: number of positions.
    thetas: 1-D array with one rotation angle per pair.
  """
  phases = np.arange(q)[:, None] * 1j * thetas[None, :]
  # Viewing complex128 as float64 exposes each number as (real, imag).
  # `np.float64` is spelled out because the `np.float_` alias was removed in
  # NumPy 2.0.
  return np.exp(phases).view(dtype=np.float64)

def reversed_rotation_encoding(q, thetas):
  """Encode the positions 0..q-1 as interleaved (sin, cos) pairs.

  Row p is [sin(p * thetas[0]), cos(p * thetas[0]), sin(p * thetas[1]), ...],
  so the result has shape (q, 2 * len(thetas)).
  """
  n_angles = thetas.shape[0]
  positions = np.arange(q)[:, None]
  unit = np.exp(positions * 1j * thetas[None, :])
  table = np.empty((q, n_angles * 2))
  table[:, 1::2] = unit.real
  table[:, 0::2] = unit.imag
  return table

In [37]:
def shift_by_delta(d, delta, theta):
  """Return a block-diagonal d x d matrix of 2x2 rotations.

  Block i covers rows and columns 2i and 2i + 1 and equals
  [[cos, sin], [-sin, cos]] of the angle theta[i] * delta. With an odd `d`
  the last row and column stay zero.
  """
  M = np.zeros((d, d))
  for i in range(d // 2):
    angle = theta[i] * delta
    c, s = np.cos(angle), np.sin(angle)
    lo = 2 * i
    M[lo:lo + 2, lo:lo + 2] = [[c, s], [-s, c]]
  return M

In [39]:
def og_angles(d):
  """Return the d // 2 angles 1 / 10000 ** (2i / d), i = 0 .. d // 2 - 1."""
  return 1 / (10_000 ** (2 * np.arange(d // 2) / d))


# With these angles the (sin, cos) rotation encoding reproduces the
# sinusoidal table of `og_embedding`.
q = 64
d = 128
np.testing.assert_allclose(
    reversed_rotation_encoding(q, og_angles(d)),
    og_embedding(q, d),
)

In [40]:
def random_rotation_embedding(q, d):
    """Rotation encoding of `q` positions with d // 2 angles drawn from [0, 2*pi)."""
    n_angles = d // 2
    thetas = np.random.uniform(0, 2 * np.pi, size=n_angles)
    return rotation_encoding(q, thetas=thetas)

In [41]:
def standard_padding_dimension(encoding, epsilon=1e-3):
  """Count the columns of `encoding` whose values span at most 2 * epsilon."""
  column_max = np.max(encoding, axis=0)
  column_min = np.min(encoding, axis=0)
  nearly_constant = column_max - column_min <= 2 * epsilon
  return np.sum(nearly_constant)

In [42]:
def random_normal_encoding(q, d):
    """Return a (q, d) array of independent standard-normal samples."""
    return np.random.normal(loc=0, scale=1, size=(q, d))

def random_sign_encoding(q, d):
  """Return a (q, d) integer array of independent, uniform +1 / -1 entries."""
  coin_flips = np.random.randint(0, 2, (q, d))
  return coin_flips * 2 - 1

In [44]:
import itertools

def all_bit_strings(n):
    """List every length-`n` tuple of 0/1 bits, in lexicographic order."""
    return list(itertools.product([0, 1], repeat=n))

def hamming_embedding(r):
    """Return the 2 ** k codewords of a length-n binary code as +1 / -1 rows.

    Here n = 2 ** r - 1 and k = n - r. The columns of H are all non-zero
    r-bit strings, ordered so that H = [I_r | A]; the codewords are the
    products of every k-bit message with G = [I_k | A^T], taken modulo 2.
    (Presumably a Hamming code up to a permutation of coordinates - confirm.)
    The result has shape (2 ** k, n).
    """
    n = 2 ** r - 1
    k = n - r
    zero = (0,) * r
    columns = [bits for bits in all_bit_strings(r) if bits != zero]
    # Put in standard form: a stable sort by the number of ones moves the r
    # single-one columns to the front.
    columns = sorted(reversed(columns), key=sum)
    H = np.array(columns).T
    A = H[:, r:]
    G = np.concatenate((np.eye(k), A.T), axis=1)
    messages = np.array(all_bit_strings(k))
    codewords = np.einsum('kn,vk->vn', G, messages) % 2
    return codewords * 2 - 1.

# Chapter 8 - The TAOTP Interpreter

In [45]:
import ast

def compile_TAOTP(program: str):
    """Parse a TAOTP program (a Python dict literal) into transformer params.

    The program is read with `ast.literal_eval`, so it may contain only
    Python literals. If `out_emb` is missing, it is taken from `tok_emb`. The
    three embeddings and every non-dict layer entry are converted to NumPy
    arrays.

    Args:
        program: source text of a dict literal with the keys `tok_emb`,
            `pos_emb`, `layers` and optionally `out_emb`.

    Returns:
        The parsed parameter dict with array-valued entries.

    Raises:
        ValueError or SyntaxError: if `program` is not a valid literal.
        KeyError: if a required key is missing.
    """
    params = ast.literal_eval(program)
    if 'out_emb' not in params:
        params['out_emb'] = params['tok_emb']
    for emb in ('tok_emb', 'pos_emb', 'out_emb'):
        params[emb] = np.array(params[emb])
    for layer in params['layers']:
        for key, value in layer.items():
            # Dict-valued entries (e.g. layer-norm settings) must stay dicts
            # so they can be **-unpacked later; np.array(dict) would wrap
            # them in a 0-d object array.
            if not isinstance(value, dict):
                layer[key] = np.array(value)
    return params