(Building a lexer in Racket) How to identify & tokenize a line number that comes after a "gosub" statement

259 Views Asked by At

I am new to Racket and am building a lexer using the parser-tools/lex module. I want to tokenize a number that comes after a 'gosub' statement as a line-number token, but I am having trouble figuring out how to identify it as a line number rather than a regular number. I am reading in .txt files that look like this:

10 read A 20 read B 30 gosub 400 40 if C = 400 then write C 400 C = A + B : return $$

The "400" in "gosub 400" gets read as a number token.

I want this expected output for my gosub token and line-number token: ... token-GOSUB: GOSUB token-line-num: 400 ...

`#lang racket
;;; IMPORT
;; Import the lexer tools 
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre)  ; names from lex-sre are prefixed with :
         ;                                     to avoid name collisions
         )

;;; REGULAR EXPRESSIONS

;; Named regular-expression abbreviations used by the lexer rules.
;; Operators from parser-tools/lex-sre carry a ":" prefix (see the prefix-in
;; require), so alternation must be written :or -- plain `or` is Racket's
;; boolean macro, which the lexer cannot expand as a regular expression.
(define-lex-abbrevs
  [read        "read"]
  [write       "write"]
  [goto        "goto"]
  [gosub       "gosub"]
  [line-num    (:/ #\1 #\9)]                            ; a single digit 1-9
  [letter      (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
  [digit       (:/ #\0 #\9)]                            ; a single digit 0-9
  [mult-op     (:or "*" "/")]
  [add-op      (:or "+" "-")]
  [end-of-file "$$"]
  [paren-start "("]
  [paren-end   ")"]
  )

;;; TOKENS

;; Value-carrying tokens.  For every name N listed here, define-tokens
;; creates a constructor token-N that wraps a value (a number, a lexeme, ...).
(define-tokens value-tokens
  (NUMBER
   END-OF-PROGRAM
   READ WRITE GOTO GOSUB LINE-NUM
   IDENTIFIER
   MULT-OP ADD-OP
   PAREN-START PAREN-END))

;; Tokens that don't carry a value; each is represented by its bare symbol.
;; ":" is included because the lexer's operator rule emits the symbol `:`
;; for the statement separator, so it must belong to the token group too.
(define-empty-tokens op-tokens (newline : :=  = < > ^ \( \) EOF))

;;; LEXER

;; The construct lexer-src-pos evaluates to a function which scans an input port
;; returning one position-token at a time.

;; A position token contains besides the actual token also source location information
;; (i.e. you can see where in the file the token was read)

;; Main lexer: each call reads one token from the given input port and
;; returns it wrapped in a position-token.  Whitespace is skipped by
;; recursing on the same port.
(define lex
  (lexer-src-pos
    [(eof)                                          ; input: end of file
    'EOF]                                           ; output: the symbol EOF

    [(:+ end-of-file)                               ; input: one or more "$$" markers
    (token-END-OF-PROGRAM (string->symbol lexeme))] ; output: END-OF-PROGRAM token holding the lexeme as a symbol

    [(:or #\tab #\space #\newline)                  ; input: whitespace (tab, space or line feed)
    (return-without-pos (lex input-port))]          ; output: the next token
   ;                                                  (i.e. skip the whitespace)

    ["\r"                                           ; input: a carriage return
    (token-newline)]                                ; output: a newline token
   ;                                                ; note:  (token-newline) returns 'newline

    [(:or ":" ":=" "^" "<" ">" "=")                 ; input:  an operator
    (string->symbol lexeme)]                        ; output: corresponding symbol

    [(:or "+" "-")                                  ; input: "+" or "-"
    (token-ADD-OP (string->symbol lexeme))]         ; output: an ADD-OP token

    [(:or "*" "/")                                  ; input: "*" or "/"
    (token-MULT-OP (string->symbol lexeme))]        ; output: a MULT-OP token

    [(:+ digit)                                     ; input:  digits
    (token-NUMBER (string->number lexeme))]         ; output: a NUMBER token whose value is the number

    [(:+ read)                                      ; input: the string "read"
    (token-READ lexeme)]                            ; output: READ token

    [(:+ write)                                     ; input: the string "write"
    (token-WRITE lexeme)]                           ; output: WRITE token

    [(:+ goto)                                      ; input: the string "goto"
    (token-GOTO lexeme)]                            ; output: GOTO token
    
    [(:+ gosub)                                      ; input: the string "gosub" (the keyword only, no digits)
    (token-GOSUB lexeme)]                            ; output: GOSUB token

    ; NOTE: the ** markers around the next rule are emphasis markup from the
    ; original post, not Racket syntax.
    ; This rule matches one or more digits 1-9 wherever they occur; it does not
    ; look at a preceding "gosub".  It is listed after the (:+ digit) rule, whose
    ; pattern covers everything this one can match, and the post reports that
    ; "400" comes out as a NUMBER token rather than LINE-NUM.
**    [(:+ line-num)                                      ; input: digits in the range 1-9
    (token-LINE-NUM lexeme)] **
    

    [(:+ letter)                                    ; input: alphabetic letters (also "?" and "!")
    (token-IDENTIFIER lexeme)]                      ; output: IDENTIFIER token whose value is the word

    [(:+ paren-start)                               ; input: (
    (token-PAREN-START lexeme)]                     ; output: PAREN-START token 

    [(:+ paren-end)                                 ; input: )
    (token-PAREN-END lexeme)]                       ; output: PAREN-END token 
  ))



;; Lex the file at path `s` and return its tokens as a list.
;; call-with-input-file* closes the port both on normal return and when the
;; lexer raises, whereas a bare open-input-file leaves the file handle open.
(define (string->tokens s)
  (call-with-input-file* s port->tokens))

;; Drain `in` through `lex` and return the position-tokens read before EOF,
;; in input order.  The EOF token itself is not included.
(define (port->tokens in)
  (let collect ([seen '()])
    (define next (lex in))
    (cond
      [(eq? (position-token-token next) 'EOF) (reverse seen)]
      [else (collect (cons next seen))])))

(provide string->tokens)`

I have tried using regexps, but I am not really sure how to use them properly when working with tokens from parser-tools/lex in Racket. My attempt had no effect: the value was simply returned as a number again.

1

There is 1 best solution below

1
Martin Půda On BEST ANSWER

I added a new abbreviation:

;; Matches "gosub " (the keyword plus exactly one space) followed by zero or more digits.
[gosub+ (concatenation "gosub " (repetition 0 +inf.0 digit))]

If this string is found, it is parsed with its own function:

;; Drain `in` through `gosub-lex` and return the position-tokens read before
;; EOF, in input order.
(define (gosub-tokens in)
  (let collect ([seen '()])
    (define next (gosub-lex in))
    (cond
      [(eq? (position-token-token next) 'EOF) (reverse seen)]
      [else (collect (cons next seen))])))

and its own lex:

;; Secondary lexer for the text of a "gosub <digits>" match: the keyword
;; becomes a GOSUB token and the digits become a LINE-NUM token holding the
;; number.  The four patterns cannot match the same input, so their order
;; does not affect the result.
(define gosub-lex
  (lexer-src-pos
   [(:+ gosub)                                    ; the keyword
    (token-GOSUB lexeme)]
   [(:+ digit)                                    ; the target line number
    (token-LINE-NUM (string->number lexeme))]
   [(:or #\space #\tab #\newline)                 ; skip whitespace
    (return-without-pos (gosub-lex input-port))]
   [(eof)
    'EOF]))

The full code:

#lang racket
(require parser-tools/lex
         (prefix-in : parser-tools/lex-sre))

;;; REGULAR EXPRESSIONS

;; Named regular-expression abbreviations used by the lexer rules.
;; Operators from parser-tools/lex-sre carry a ":" prefix (see the prefix-in
;; require), so alternation must be written :or -- plain `or` is Racket's
;; boolean macro, which the lexer cannot expand as a regular expression.
(define-lex-abbrevs
  [read        "read"]
  [write       "write"]
  [goto        "goto"]
  [digit       (:/ #\0 #\9)]
  [gosub       "gosub"]
  ;; "gosub " (keyword plus exactly one space) followed by zero or more digits
  [gosub+      (concatenation "gosub " (repetition 0 +inf.0 digit))]
  [letter      (:or (:/ "a" "z") (:/ #\A #\Z) "?" "!")]
  [mult-op     (:or "*" "/")]
  [add-op      (:or "+" "-")]
  [end-of-file "$$"]
  [paren-start "("]
  [paren-end   ")"])

;;; TOKENS

;; Value-carrying tokens.  For every name N listed here, define-tokens
;; creates a constructor token-N that wraps a value.  GOSUB+ is the
;; intermediate token for a whole "gosub <digits>" match.
(define-tokens value-tokens
  (NUMBER
   END-OF-PROGRAM
   READ WRITE GOTO
   GOSUB GOSUB+ LINE-NUM
   IDENTIFIER
   MULT-OP ADD-OP
   PAREN-START PAREN-END))

;; Tokens that don't carry a value; each is represented by its bare symbol.
;; ":" is included because the lexer's operator rule emits the symbol `:`
;; for the statement separator, so it must belong to the token group too.
(define-empty-tokens op-tokens (newline : :=  = < > ^ \( \) EOF))

;;; LEXER

;; Secondary lexer for the text of a GOSUB+ match: the keyword becomes a
;; GOSUB token and the digits become a LINE-NUM token holding the number.
;; The four patterns cannot match the same input, so their order does not
;; affect the result.
(define gosub-lex
  (lexer-src-pos
   [(:+ gosub)                                    ; the keyword
    (token-GOSUB lexeme)]
   [(:+ digit)                                    ; the target line number
    (token-LINE-NUM (string->number lexeme))]
   [(:or #\space #\tab #\newline)                 ; skip whitespace
    (return-without-pos (gosub-lex input-port))]
   [(eof)
    'EOF]))

;; Main lexer: each call reads one token from the given input port and
;; returns it wrapped in a position-token.
;;
;; The lexer takes the longest match; when two rules match the same length
;; the rule listed first wins, so the keyword rules must stay above the
;; identifier rule.
;;
;; A "gosub" followed by spaces and digits is returned as a single GOSUB+
;; token whose value is the whole lexeme; the caller splits it into GOSUB
;; and LINE-NUM.  A "gosub" without a following line number is returned
;; directly as GOSUB instead of falling through to the identifier rule.
(define lex
  (lexer-src-pos
    [(eof)
    'EOF]
    [(:+ end-of-file)
    (token-END-OF-PROGRAM (string->symbol lexeme))]
    [(:or #\tab #\space #\newline)                  ; skip whitespace
    (return-without-pos (lex input-port))]
    ["\r"
    (token-newline)]
    [(:or ":" ":=" "^" "<" ">" "=")
    (string->symbol lexeme)]
    [(:or "+" "-")
    (token-ADD-OP (string->symbol lexeme))]
    [(:or "*" "/")
    (token-MULT-OP (string->symbol lexeme))]
    [(:+ digit)
    (token-NUMBER (string->number lexeme))]
    [(:+ read)
    (token-READ lexeme)]
    [(:+ write)
    (token-WRITE lexeme)]
    [(:+ goto)
    (token-GOTO lexeme)]
    ;; Keyword, one or more spaces, then the target line number.  Accepting
    ;; several spaces keeps "gosub   400" from yielding a plain NUMBER.
    [(:seq gosub (:+ #\space) (:+ digit))
    (token-GOSUB+ lexeme)]
    ;; Bare keyword (followed by a tab, a newline, end of input, ...).
    ;; Longer words such as "gosubs" still go to the identifier rule below
    ;; because the longest match wins.
    [gosub
    (token-GOSUB lexeme)]
    [(:+ letter)
    (token-IDENTIFIER lexeme)]
    [(:+ paren-start)
    (token-PAREN-START lexeme)]
    [(:+ paren-end)
    (token-PAREN-END lexeme)]))

;; Lex the file at path `s` and return its tokens as a list.
;; call-with-input-file* closes the port both on normal return and when the
;; lexer raises, whereas a bare open-input-file leaves the file handle open.
(define (string->tokens s)
  (call-with-input-file* s port->tokens))

;; Drain `in` through `gosub-lex` and return the position-tokens read before
;; EOF, in input order.
(define (gosub-tokens in)
  (let collect ([seen '()])
    (define next (gosub-lex in))
    (cond
      [(eq? (position-token-token next) 'EOF) (reverse seen)]
      [else (collect (cons next seen))])))
        
;; Read every token from `in` with `lex` and return the position-tokens seen
;; before EOF.  Each GOSUB+ token is replaced by the GOSUB / LINE-NUM tokens
;; obtained by re-lexing its lexeme with `gosub-lex`.
(define (port->tokens in)
  ;; Re-lexed tokens carry positions relative to the temporary string port
  ;; (offsets start at 1).  Shift a position so it is relative to `in`
  ;; again, using the start of the GOSUB+ token as the origin.  The column
  ;; moves by the same amount as the offset, which is exact as long as the
  ;; lexeme contains no newline or tab.  Unknown (#f) fields stay #f.
  (define (rebase pos origin)
    (define off   (position-offset pos))
    (define base  (position-offset origin))
    (define delta (and off (sub1 off)))
    (make-position (and delta base (+ base delta))
                   (position-line origin)
                   (and delta
                        (position-col origin)
                        (+ (position-col origin) delta))))
  ;; Same token, positions moved into the coordinates of `in`.
  (define (relocate sub origin)
    (make-position-token (position-token-token sub)
                         (rebase (position-token-start-pos sub) origin)
                         (rebase (position-token-end-pos sub) origin)))
  (let ((token (lex in)))
    (cond ((eq? (position-token-token token) 'EOF) '())
          ((eq? (token-name (position-token-token token)) 'GOSUB+)
           (let ((origin (position-token-start-pos token))
                 (parts  (gosub-tokens
                          (open-input-string
                           (token-value (position-token-token token))))))
             (append (map (lambda (sub) (relocate sub origin)) parts)
                     (port->tokens in))))
          (else (cons token (port->tokens in))))))

(provide string->tokens)