Implement tokenizer in Fortran (certik#34)
* Initial BPE encoder implementation
* Support UTF-8
* Add encoder vocabulary pairs list into model.dat
* Store byte_encoder in model.dat, use it to compute byte_decoder (see the sketch after this list)
* Implement namelist based input file
* Update README and CI
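
Only byte_encoder is written to model.dat; the Fortran loader rebuilds byte_decoder at startup by inverting it (see main.f90 in the diff below). A minimal, self-contained sketch of that inversion follows; the identity table used here is only a placeholder for the real byte_encoder values read from model.dat:

```
! Sketch of the byte_encoder -> byte_decoder inversion done at load time.
! The identity table below is a placeholder; the real byte_encoder values
! come from model.dat.
program byte_decoder_sketch
implicit none
integer :: byte_encoder(0:255)
integer, allocatable :: byte_decoder(:)
integer :: i
byte_encoder = [(i, i = 0, 255)]   ! placeholder values
allocate(byte_decoder(0:maxval(byte_encoder)))
byte_decoder = 0
do i = 0, size(byte_encoder) - 1
  byte_decoder(byte_encoder(i)) = i   ! invert the permutation
end do
print "(i0)", byte_decoder(65)       ! prints 65 for the identity placeholder
end program
```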
certik authored Mar 21, 2023
1 parent b9d2b28 commit ae21407
Showing 9 changed files with 360 additions and 98 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -48,6 +48,7 @@ enable_testing()
set(SRC
main.f90
gpt2.f90
tokenizer.f90
)
if (FASTGPT_BLAS STREQUAL "Accelerate")
list(APPEND SRC
47 changes: 19 additions & 28 deletions README.md
@@ -54,13 +54,10 @@ subsequent runs:

python create_model.py --models_dir "models" --model_size "124M"

Create an input file:
Now you can modify the `input` file to change the input string and set other
parameters.

python encode_input.py \
"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
-n 20

Run (requires `model.dat` and `input.dat` in the current directory):
Run (requires `model.dat` and `input` in the current directory):

./gpt2

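For reference, the `input` file that replaces the old `input.dat` is a Fortran namelist (`&input_fastGPT`) holding the generation parameters, followed by the prompt text; a minimal example matching the `input` file added in this commit:

```
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
```

Everything after the closing `/` is read line by line and joined with newlines into the prompt.
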
@@ -70,46 +67,40 @@ The above `./gpt2` command prints on Apple M1 Max:
```
$ ./gpt2
Loading the model...
done.
done. Time: 0.111s
Model parameters:
n_vocab = 50257
n_ctx = 1024
n_embd = 768
n_layer = 12
n_head = 12
Input text
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
Encoding: tokenizing input text into tokens (currently slow)...
done. Time: 0.074s
Input parameters:
n_seq = 19
n_tokens_to_generate = 20
Input tokens:
36235 39141 18765 1143 326 9061 561 530 1110 1716 845 3665 11 475 772 339 714 407 5967
Decoded input as text:
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
Running model...
1 703
2 484
3 561
4 307
5 1498
6 284
7 466
8 523
9 13
10 198
11 198
12 1
13 40
14 892
15 326
16 262
17 749
18 1593
19 1517
20 318
done. Time: 0.795s
how they would be able to do so.
"I think that the most important thing is
done. Time: 0.304s (1.01x)
Output tokens:
703 484 561 307 1498 284 466 523 13 198 198 1 40 892 326 262 749 1593 1517 318
Decoded output as text:
how they would be able to do so.
3 changes: 0 additions & 3 deletions ci/build.sh
@@ -6,9 +6,6 @@ cmake .
make
mkdir models
python create_model.py --models_dir "models" --model_size "124M"
python encode_input.py \
"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
-n 20
./gpt2

make clean
File renamed without changes.
29 changes: 21 additions & 8 deletions create_model.py
@@ -113,7 +113,8 @@ def load_encoder_hparams_and_params(model_size, models_dir):

return hparams, params

def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
def convert(params, n_head, n_ctx, idx, decoder_txt,
vocab_idx, vocab_txt, byte_decoder):
t1 = clock()
blocks = params["blocks"]
n_embd = blocks[0]["ln_1"]["b"].size
@@ -157,7 +158,8 @@ def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
# Save the model
f = open("model.dat", "w")
np.array([n_vocab, n_ctx, n_embd, n_layer, n_head,
len(idx),len(decoder_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
len(idx),len(decoder_txt.encode("utf-8")),
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
wte.tofile(f); wpe.tofile(f)
mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
@@ -168,6 +170,8 @@ def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
lnf_b.tofile(f); lnf_g.tofile(f)
idx.tofile(f)
f.write(decoder_txt)
vocab_idx.tofile(f)
f.write(vocab_txt)
byte_decoder.tofile(f)

t2 = clock()
@@ -185,6 +189,12 @@ def load_decoder(filename):
i = 1
return decoder

def load_vocab(filename):
D = open(filename).read()
D = D.split("\n")
D = D[1:]
return D

def decoder_idx(decoder):
i = 0
idx = np.empty(len(decoder) + 1, dtype=np.int32)
@@ -211,14 +221,18 @@ def bytes_to_unicode():
for y in byte_decoder:
x = ord(y)
bd[x] = byte_decoder[y]
return bd
bd2 = np.zeros(256, dtype=np.int32)
for i in range(np.size(bd)):
bd2[bd[i]] = i
return bd2

def main(model_size: str = "124M", models_dir: str = "models"):
print("Loading model")
# load encoder, hparams, and params from the released open-ai gpt-2 files
t1 = clock()
hparams, params = load_encoder_hparams_and_params(model_size, models_dir)
decoder = load_decoder(os.path.join(models_dir, model_size, "encoder.json"))
vocab = load_vocab(os.path.join(models_dir, model_size, "vocab.bpe"))
t2 = clock()
print(" Done. Loading time: ", t2-t1)

@@ -227,14 +241,13 @@ def main(model_size: str = "124M", models_dir: str = "models"):
t1 = clock()
decoder_txt = "".join(decoder)
idx = decoder_idx(decoder)
vocab_txt = "".join(vocab)
vocab_idx = decoder_idx(vocab)
byte_decoder = bytes_to_unicode()
convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt, byte_decoder)
convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt,
vocab_idx, vocab_txt, byte_decoder)
t2 = clock()
print(" Done. Time: ", t2-t1)
# TODO: This will not be needed once we have the encoder in Fortran:
print("Copying encoder.json and vocab.bpe into the current directory")
copyfile(os.path.join(models_dir, model_size, "encoder.json"), "encoder.json")
copyfile(os.path.join(models_dir, model_size, "vocab.bpe"), "vocab.bpe")


if __name__ == "__main__":
38 changes: 1 addition & 37 deletions gpt2.f90
@@ -1,5 +1,6 @@
module gpt2_mod
use linalg, only: matmul_2d, matmul_2d_t
use tokenizer, only: decode
implicit none

integer, parameter :: sp = kind(0.0)
@@ -271,41 +272,4 @@ function generate(n_tokens_to_generate, &
print *
end function

function c2s(x) result(y)
character, intent(in) :: x(:)
character(:), allocatable :: y
integer :: i
allocate(character(size(x)) :: y)
do i = 1, size(x)
y(i:i) = x(i)
end do
end function

function decode(tokens, idx, decoder_txt, byte_decoder) result(output)
integer, intent(in) :: tokens(:), idx(0:), byte_decoder(0:)
character, intent(in) :: decoder_txt(:)
character(:), allocatable :: output
character(:), allocatable :: output2, tmp
integer :: i, c, d
allocate(character(0) :: output2) ! Fix GFortran warning
output2 = ""
do i = 1, size(tokens)
output2 = output2 // c2s(decoder_txt(idx(tokens(i))+1:idx(tokens(i)+1)))
end do
i = 1
output = ""
do
c = iachar(output2(i:i))
if (c >= 128) then
i = i + 1
d = iachar(output2(i:i))
c = ior(ishft(iand(c, 31), 6), iand(d, 63))
end if
tmp = achar(byte_decoder(c))
output = output // tmp
if (i == len(output2)) exit
i = i + 1
end do
end function

end module
4 changes: 4 additions & 0 deletions input
@@ -0,0 +1,4 @@
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
91 changes: 69 additions & 22 deletions main.f90
@@ -1,14 +1,17 @@
program gpt2
use gpt2_mod, only: generate, decode
use gpt2_mod, only: generate
use tokenizer, only: encode, decode
use omp, only: omp_get_wtime
implicit none

integer, parameter :: sp = kind(0.0)
integer, parameter :: dp = kind(0.d0)

integer :: n_vocab, n_ctx, n_seq, n_embd, n_layer, n_head, &
n_tokens_to_generate, n_decoder_idx, n_decoder_txt, n_byte_decoder
integer, allocatable :: input(:), decoder_idx(:), byte_decoder(:)
n_tokens_to_generate, n_decoder_idx, n_decoder_txt, &
n_vocab_idx, n_vocab_txt, n_byte_encoder
integer, allocatable :: input(:), decoder_idx(:), vocab_idx(:), byte_decoder(:)
integer :: byte_encoder(0:255)
real(sp), allocatable :: wte(:,:), wpe(:,:), &
mlp_fc_w(:,:,:), mlp_fc_b(:,:), &
mlp_proj_w(:,:,:), mlp_proj_b(:,:), &
@@ -17,19 +20,24 @@ program gpt2
ln1_b(:,:), ln1_g(:,:), &
ln2_b(:,:), ln2_g(:,:), &
lnf_b(:), lnf_g(:)
character, allocatable :: decoder_txt(:)
character, allocatable :: decoder_txt(:), vocab_txt(:)
integer, allocatable :: output(:)
character(:), allocatable :: output_txt
character(:), allocatable :: output_txt, input_txt
character(1024) :: input_txt2
real(dp) :: t1, t2, t1o, t2o
integer :: u
integer :: u, i, ios
logical :: use_cache
namelist / input_fastGPT / n_tokens_to_generate

! Load the model
print "(a)", "Loading the model..."
call cpu_time(t1)
open(newunit=u, file="model.dat", form="unformatted", access="stream", status="old")
!read(u) model_version
! fastGPT (digits look similar to the letters they represent)
! model_version /= 0xfa51697
read(u) n_vocab, n_ctx, n_embd, n_layer, n_head, n_decoder_idx, n_decoder_txt, &
n_byte_decoder
n_vocab_idx, n_vocab_txt, n_byte_encoder
allocate(wte(n_embd,n_vocab), wpe(n_embd,n_ctx), &
mlp_fc_w(4*n_embd,n_embd,n_layer), mlp_fc_b(4*n_embd,n_layer), &
mlp_proj_w(n_embd,4*n_embd,n_layer), mlp_proj_b(n_embd,n_layer), &
@@ -39,7 +47,8 @@ program gpt2
ln2_b(n_embd,n_layer), ln2_g(n_embd,n_layer), &
lnf_b(n_embd), lnf_g(n_embd), &
decoder_idx(n_decoder_idx), decoder_txt(n_decoder_txt), &
byte_decoder(n_byte_decoder))
vocab_idx(n_vocab_idx), vocab_txt(n_vocab_txt))
if (n_byte_encoder /= 256) error stop "n_byte_encoder must be 256"
read(u) wte, wpe, &
mlp_fc_w, mlp_fc_b, &
mlp_proj_w, mlp_proj_b, &
@@ -48,40 +57,77 @@ program gpt2
ln1_b, ln1_g, &
ln2_b, ln2_g, &
lnf_b, lnf_g, &
decoder_idx, decoder_txt, byte_decoder
decoder_idx, decoder_txt, &
vocab_idx, vocab_txt, &
byte_encoder
close(u)
call cpu_time(t2)
print "(a,f8.3,a)", " done. Time:", t2-t1, "s"

! Load the input
open(newunit=u, file="input.dat", form="unformatted", access="stream", status="old")
read(u) n_seq, n_tokens_to_generate
allocate(input(n_seq))
read(u) input
close(u)

print *
print "(a)", "Model parameters:"
print "(a,i6)", "n_vocab =", n_vocab
print "(a,i6)", "n_ctx =", n_ctx
print "(a,i6)", "n_embd =", n_embd
print "(a,i6)", "n_layer =", n_layer
print "(a,i6)", "n_head =", n_head
print *

! Compute byte_decoder:
allocate(byte_decoder(0:maxval(byte_encoder)))
byte_decoder = 0
do i = 0, size(byte_encoder)-1
byte_decoder(byte_encoder(i)) = i
end do

! Load the input
allocate(character(0) :: input_txt)
input_txt = ""
open(newunit=u, file="input", status="old")
read(u, input_fastGPT)
do
read(u, "(a)", iostat=ios) input_txt2
if (ios /= 0) exit
if (len(input_txt) > 0) input_txt = input_txt // char(10)
input_txt = input_txt // trim(input_txt2)
end do
close(u)
print "(a)", "Input text"
print "(a)", input_txt

print *
print "(a)", "Encoding: tokenizing input text into tokens (currently slow)..."
call cpu_time(t1)
input = encode(input_txt, decoder_idx, decoder_txt, vocab_idx, vocab_txt, &
byte_encoder)
call cpu_time(t2)
n_seq = size(input)
print "(a,f8.3,a)", " done. Time:", t2-t1, "s"
print *
print "(a)", "Input parameters:"
print "(a,i4)", "n_seq =", n_seq
print "(a,i4)", "n_tokens_to_generate =", n_tokens_to_generate
print *
print "(a)", "Input tokens:"
print "(1000(i6))", input
print *


if (n_seq + n_tokens_to_generate >= n_ctx) then
print *, "The maximum sequence length of the model was surpassed."
print *, "Make the input and/or number of tokens to generate shorter."
error stop
end if

print *
print "(a)", "Input tokens:"
print "(1000(i6))", input
print "(a)", "Decoded input as text:"
print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder)
!print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder)
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(input, decoder_idx, decoder_txt, byte_decoder)
print "(a)", output_txt
print *

if (input_txt /= output_txt) then
error stop "The decoded input text does not agree with the input text"
end if

allocate(output(n_tokens_to_generate))
print "(a)", "Running model..."
@@ -99,10 +145,11 @@ program gpt2
t2o = omp_get_wtime()
call cpu_time(t2)
print "(a,f8.3,a,f4.2,a)", " done. Time:", t2o-t1o, "s (", (t2-t1)/(t2o-t1o), "x)"
print *
print "(a)", "Output tokens:"
print "(1000(i6))", output
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(output, decoder_idx, decoder_txt, byte_decoder)
print *
print "(a)", "Decoded output as text:"
print "(a)", output_txt
end program