Implement tokenizer in Fortran (certik#34)
* Initial BPE encoder implementation
* Support UTF-8
* Add encoder vocabulary pairs list into model.dat
* Store byte_encoder in model.dat, use it to compute byte_decoder (see the sketch after this list)
* Implement namelist based input file
* Update README and CI
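
Only byte_encoder is written to model.dat; the Fortran loader rebuilds byte_decoder at startup by inverting it (see main.f90 in the diff below). A minimal, self-contained sketch of that inversion follows; the identity table used here is only a placeholder for the real byte_encoder values read from model.dat:

```
! Sketch of the byte_encoder -> byte_decoder inversion done at load time.
! The identity table below is a placeholder; the real byte_encoder values
! come from model.dat.
program byte_decoder_sketch
implicit none
integer :: byte_encoder(0:255)
integer, allocatable :: byte_decoder(:)
integer :: i
byte_encoder = [(i, i = 0, 255)]   ! placeholder values
allocate(byte_decoder(0:maxval(byte_encoder)))
byte_decoder = 0
do i = 0, size(byte_encoder) - 1
  byte_decoder(byte_encoder(i)) = i   ! invert the permutation
end do
print "(i0)", byte_decoder(65)       ! prints 65 for the identity placeholder
end program
```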
certik authored Mar 21, 2023
1 parent b9d2b28 commit ae21407
Showing 9 changed files with 360 additions and 98 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -48,6 +48,7 @@ enable_testing()
set(SRC
main.f90
gpt2.f90
tokenizer.f90
)
if (FASTGPT_BLAS STREQUAL "Accelerate")
list(APPEND SRC
47 changes: 19 additions & 28 deletions README.md
@@ -54,13 +54,10 @@ subsequent runs:

python create_model.py --models_dir "models" --model_size "124M"

Create an input file:
Now you can modify the `input` file to change the input string and set other
parameters.

python encode_input.py \
"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
-n 20

Run (requires `model.dat` and `input.dat` in the current directory):
Run (requires `model.dat` and `input` in the current directory):

./gpt2

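For reference, the `input` file that replaces the old `input.dat` is a Fortran namelist (`&input_fastGPT`) holding the generation parameters, followed by the prompt text; a minimal example matching the `input` file added in this commit:

```
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
```

Everything after the closing `/` is read line by line and joined with newlines into the prompt.
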
@@ -70,46 +67,40 @@ The above `./gpt2` command prints on Apple M1 Max:
```
$ ./gpt2
Loading the model...
done.
done. Time: 0.111s
Model parameters:
n_vocab = 50257
n_ctx = 1024
n_embd = 768
n_layer = 12
n_head = 12
Input text
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
Encoding: tokenizing input text into tokens (currently slow)...
done. Time: 0.074s
Input parameters:
n_seq = 19
n_tokens_to_generate = 20
Input tokens:
36235 39141 18765 1143 326 9061 561 530 1110 1716 845 3665 11 475 772 339 714 407 5967
Decoded input as text:
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
Running model...
1 703
2 484
3 561
4 307
5 1498
6 284
7 466
8 523
9 13
10 198
11 198
12 1
13 40
14 892
15 326
16 262
17 749
18 1593
19 1517
20 318
done. Time: 0.795s
how they would be able to do so.
"I think that the most important thing is
done. Time: 0.304s (1.01x)
Output tokens:
703 484 561 307 1498 284 466 523 13 198 198 1 40 892 326 262 749 1593 1517 318
Decoded output as text:
how they would be able to do so.
3 changes: 0 additions & 3 deletions ci/build.sh
@@ -6,9 +6,6 @@ cmake .
make
mkdir models
python create_model.py --models_dir "models" --model_size "124M"
python encode_input.py \
"Alan Turing theorized that computers would one day become very powerful, but even he could not imagine" \
-n 20
./gpt2

make clean
File renamed without changes.
29 changes: 21 additions & 8 deletions create_model.py
@@ -113,7 +113,8 @@ def load_encoder_hparams_and_params(model_size, models_dir):

return hparams, params

def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
def convert(params, n_head, n_ctx, idx, decoder_txt,
vocab_idx, vocab_txt, byte_decoder):
t1 = clock()
blocks = params["blocks"]
n_embd = blocks[0]["ln_1"]["b"].size
@@ -157,7 +158,8 @@ def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
# Save the model
f = open("model.dat", "w")
np.array([n_vocab, n_ctx, n_embd, n_layer, n_head,
len(idx),len(decoder_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
len(idx),len(decoder_txt.encode("utf-8")),
len(vocab_idx),len(vocab_txt.encode("utf-8")),len(byte_decoder)], dtype=np.int32).tofile(f)
wte.tofile(f); wpe.tofile(f)
mlp_fc_w.tofile(f); mlp_fc_b.tofile(f)
mlp_proj_w.tofile(f); mlp_proj_b.tofile(f)
@@ -168,6 +170,8 @@ def convert(params, n_head, n_ctx, idx, decoder_txt, byte_decoder):
lnf_b.tofile(f); lnf_g.tofile(f)
idx.tofile(f)
f.write(decoder_txt)
vocab_idx.tofile(f)
f.write(vocab_txt)
byte_decoder.tofile(f)

t2 = clock()
@@ -185,6 +189,12 @@ def load_decoder(filename):
i = 1
return decoder

def load_vocab(filename):
D = open(filename).read()
D = D.split("\n")
D = D[1:]
return D

def decoder_idx(decoder):
i = 0
idx = np.empty(len(decoder) + 1, dtype=np.int32)
@@ -211,14 +221,18 @@ def bytes_to_unicode():
for y in byte_decoder:
x = ord(y)
bd[x] = byte_decoder[y]
return bd
bd2 = np.zeros(256, dtype=np.int32)
for i in range(np.size(bd)):
bd2[bd[i]] = i
return bd2

def main(model_size: str = "124M", models_dir: str = "models"):
print("Loading model")
# load encoder, hparams, and params from the released open-ai gpt-2 files
t1 = clock()
hparams, params = load_encoder_hparams_and_params(model_size, models_dir)
decoder = load_decoder(os.path.join(models_dir, model_size, "encoder.json"))
vocab = load_vocab(os.path.join(models_dir, model_size, "vocab.bpe"))
t2 = clock()
print(" Done. Loading time: ", t2-t1)

@@ -227,14 +241,13 @@ def main(model_size: str = "124M", models_dir: str = "models"):
t1 = clock()
decoder_txt = "".join(decoder)
idx = decoder_idx(decoder)
vocab_txt = "".join(vocab)
vocab_idx = decoder_idx(vocab)
byte_decoder = bytes_to_unicode()
convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt, byte_decoder)
convert(params, hparams["n_head"], hparams["n_ctx"], idx, decoder_txt,
vocab_idx, vocab_txt, byte_decoder)
t2 = clock()
print(" Done. Time: ", t2-t1)
# TODO: This will not be needed once we have the encoder in Fortran:
print("Copying encoder.json and vocab.bpe into the current directory")
copyfile(os.path.join(models_dir, model_size, "encoder.json"), "encoder.json")
copyfile(os.path.join(models_dir, model_size, "vocab.bpe"), "vocab.bpe")


if __name__ == "__main__":
38 changes: 1 addition & 37 deletions gpt2.f90
@@ -1,5 +1,6 @@
module gpt2_mod
use linalg, only: matmul_2d, matmul_2d_t
use tokenizer, only: decode
implicit none

integer, parameter :: sp = kind(0.0)
@@ -271,41 +272,4 @@ function generate(n_tokens_to_generate, &
print *
end function

function c2s(x) result(y)
character, intent(in) :: x(:)
character(:), allocatable :: y
integer :: i
allocate(character(size(x)) :: y)
do i = 1, size(x)
y(i:i) = x(i)
end do
end function

function decode(tokens, idx, decoder_txt, byte_decoder) result(output)
integer, intent(in) :: tokens(:), idx(0:), byte_decoder(0:)
character, intent(in) :: decoder_txt(:)
character(:), allocatable :: output
character(:), allocatable :: output2, tmp
integer :: i, c, d
allocate(character(0) :: output2) ! Fix GFortran warning
output2 = ""
do i = 1, size(tokens)
output2 = output2 // c2s(decoder_txt(idx(tokens(i))+1:idx(tokens(i)+1)))
end do
i = 1
output = ""
do
c = iachar(output2(i:i))
if (c >= 128) then
i = i + 1
d = iachar(output2(i:i))
c = ior(ishft(iand(c, 31), 6), iand(d, 63))
end if
tmp = achar(byte_decoder(c))
output = output // tmp
if (i == len(output2)) exit
i = i + 1
end do
end function

end module
4 changes: 4 additions & 0 deletions input
@@ -0,0 +1,4 @@
&input_fastGPT
n_tokens_to_generate = 20
/
Alan Turing theorized that computers would one day become very powerful, but even he could not imagine
91 changes: 69 additions & 22 deletions main.f90
@@ -1,14 +1,17 @@
program gpt2
use gpt2_mod, only: generate, decode
use gpt2_mod, only: generate
use tokenizer, only: encode, decode
use omp, only: omp_get_wtime
implicit none

integer, parameter :: sp = kind(0.0)
integer, parameter :: dp = kind(0.d0)

integer :: n_vocab, n_ctx, n_seq, n_embd, n_layer, n_head, &
n_tokens_to_generate, n_decoder_idx, n_decoder_txt, n_byte_decoder
integer, allocatable :: input(:), decoder_idx(:), byte_decoder(:)
n_tokens_to_generate, n_decoder_idx, n_decoder_txt, &
n_vocab_idx, n_vocab_txt, n_byte_encoder
integer, allocatable :: input(:), decoder_idx(:), vocab_idx(:), byte_decoder(:)
integer :: byte_encoder(0:255)
real(sp), allocatable :: wte(:,:), wpe(:,:), &
mlp_fc_w(:,:,:), mlp_fc_b(:,:), &
mlp_proj_w(:,:,:), mlp_proj_b(:,:), &
@@ -17,19 +20,24 @@ program gpt2
ln1_b(:,:), ln1_g(:,:), &
ln2_b(:,:), ln2_g(:,:), &
lnf_b(:), lnf_g(:)
character, allocatable :: decoder_txt(:)
character, allocatable :: decoder_txt(:), vocab_txt(:)
integer, allocatable :: output(:)
character(:), allocatable :: output_txt
character(:), allocatable :: output_txt, input_txt
character(1024) :: input_txt2
real(dp) :: t1, t2, t1o, t2o
integer :: u
integer :: u, i, ios
logical :: use_cache
namelist / input_fastGPT / n_tokens_to_generate

! Load the model
print "(a)", "Loading the model..."
call cpu_time(t1)
open(newunit=u, file="model.dat", form="unformatted", access="stream", status="old")
!read(u) model_version
! fastGPT (digits look similar to the letters they represent)
! model_version /= 0xfa51697
read(u) n_vocab, n_ctx, n_embd, n_layer, n_head, n_decoder_idx, n_decoder_txt, &
n_byte_decoder
n_vocab_idx, n_vocab_txt, n_byte_encoder
allocate(wte(n_embd,n_vocab), wpe(n_embd,n_ctx), &
mlp_fc_w(4*n_embd,n_embd,n_layer), mlp_fc_b(4*n_embd,n_layer), &
mlp_proj_w(n_embd,4*n_embd,n_layer), mlp_proj_b(n_embd,n_layer), &
@@ -39,7 +47,8 @@ program gpt2
ln2_b(n_embd,n_layer), ln2_g(n_embd,n_layer), &
lnf_b(n_embd), lnf_g(n_embd), &
decoder_idx(n_decoder_idx), decoder_txt(n_decoder_txt), &
byte_decoder(n_byte_decoder))
vocab_idx(n_vocab_idx), vocab_txt(n_vocab_txt))
if (n_byte_encoder /= 256) error stop "n_byte_encoder must be 256"
read(u) wte, wpe, &
mlp_fc_w, mlp_fc_b, &
mlp_proj_w, mlp_proj_b, &
@@ -48,40 +57,77 @@ program gpt2
ln1_b, ln1_g, &
ln2_b, ln2_g, &
lnf_b, lnf_g, &
decoder_idx, decoder_txt, byte_decoder
decoder_idx, decoder_txt, &
vocab_idx, vocab_txt, &
byte_encoder
close(u)
call cpu_time(t2)
print "(a,f8.3,a)", " done. Time:", t2-t1, "s"

! Load the input
open(newunit=u, file="input.dat", form="unformatted", access="stream", status="old")
read(u) n_seq, n_tokens_to_generate
allocate(input(n_seq))
read(u) input
close(u)

print *
print "(a)", "Model parameters:"
print "(a,i6)", "n_vocab =", n_vocab
print "(a,i6)", "n_ctx =", n_ctx
print "(a,i6)", "n_embd =", n_embd
print "(a,i6)", "n_layer =", n_layer
print "(a,i6)", "n_head =", n_head
print *

! Compute byte_decoder:
allocate(byte_decoder(0:maxval(byte_encoder)))
byte_decoder = 0
do i = 0, size(byte_encoder)-1
byte_decoder(byte_encoder(i)) = i
end do

! Load the input
allocate(character(0) :: input_txt)
input_txt = ""
open(newunit=u, file="input", status="old")
read(u, input_fastGPT)
do
read(u, "(a)", iostat=ios) input_txt2
if (ios /= 0) exit
if (len(input_txt) > 0) input_txt = input_txt // char(10)
input_txt = input_txt // trim(input_txt2)
end do
close(u)
print "(a)", "Input text"
print "(a)", input_txt

print *
print "(a)", "Encoding: tokenizing input text into tokens (currently slow)..."
call cpu_time(t1)
input = encode(input_txt, decoder_idx, decoder_txt, vocab_idx, vocab_txt, &
byte_encoder)
call cpu_time(t2)
n_seq = size(input)
print "(a,f8.3,a)", " done. Time:", t2-t1, "s"
print *
print "(a)", "Input parameters:"
print "(a,i4)", "n_seq =", n_seq
print "(a,i4)", "n_tokens_to_generate =", n_tokens_to_generate
print *
print "(a)", "Input tokens:"
print "(1000(i6))", input
print *


if (n_seq + n_tokens_to_generate >= n_ctx) then
print *, "The maximum sequence length of the model was surpassed."
print *, "Make the input and/or number of tokens to generate shorter."
error stop
end if

print *
print "(a)", "Input tokens:"
print "(1000(i6))", input
print "(a)", "Decoded input as text:"
print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder)
!print "(a)", decode(input, decoder_idx, decoder_txt, byte_decoder)
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(input, decoder_idx, decoder_txt, byte_decoder)
print "(a)", output_txt
print *

if (input_txt /= output_txt) then
error stop "The decoded input text does not agree with the input text"
end if

allocate(output(n_tokens_to_generate))
print "(a)", "Running model..."
@@ -99,10 +145,11 @@ program gpt2
t2o = omp_get_wtime()
call cpu_time(t2)
print "(a,f8.3,a,f4.2,a)", " done. Time:", t2o-t1o, "s (", (t2-t1)/(t2o-t1o), "x)"
print *
print "(a)", "Output tokens:"
print "(1000(i6))", output
allocate(character(0) :: output_txt) ! Fix GFortran warning
output_txt = decode(output, decoder_idx, decoder_txt, byte_decoder)
print *
print "(a)", "Decoded output as text:"
print "(a)", output_txt
end program