| (require 2htdp/batch-io) |
| (require racket/string) ; Just for string-split |
| |
; A Vector (x) is a [List-of Number]
| |
| |
; Example texts for the spam class.
; spam-emails : [List-of String]
(define spam-emails
  (list "buy these drugs"
        "drugs for sale"
        "how to buy buy buy"))
| |
; Example texts for the ham (non-spam) class.
; ham-emails : [List-of String]
(define ham-emails
  (list "how was your break?"
        "did you buy bread?"))
| |
; A Vocabulary is a [List-of String]
; VOCAB fixes which words get encoded and in what order: a word's
; position in this list is the index of its entry in every vector.
(define VOCAB
  (list "bread" "break" "sale" "buy"
        "drugs" "for" "these" "you"
        "your" "how" "to"))
| |
| |
; Add two vectors entry by entry.
; Both vectors must have the same length; map signals an error
; when the two lists differ in length.
; vector+ : Vector Vector -> Vector
(check-expect (vector+ (list 1 2 3) (list 4 5 6)) (list 5 7 9))
(check-expect (vector+ '() '()) '())
(define (vector+ v1 v2)
  (map + v1 v2))
| |
| |
; Look up the 0-based position of a word in VOCAB.
; Produces #false when the word is not in the vocabulary.
; w2idx : String -> [Maybe Number]
(check-expect (w2idx "bread") 0)
(check-expect (w2idx "break") 1)
(check-expect (w2idx "to") 10)
(check-expect (w2idx "eggs") #false)
(define (w2idx w)
  (local [; ACCUMULATOR: idx counts the vocabulary entries already
          ; skipped, i.e. it is the position of (first vocab) in VOCAB.
          (define (w2idx/a vocab idx)
            (cond
              [(empty? vocab) #false] ; ran off the end: w is not in VOCAB
              [(string=? (first vocab) w) idx]
              [else (w2idx/a (rest vocab) (add1 idx))]))]
    (w2idx/a VOCAB 0)))
| |
; Encode a word as a "one-hot" vector over VOCAB: a 1 at the word's
; own position and 0 everywhere else. A word that is not in VOCAB
; is encoded as the all-zero vector.
; word2vector : String -> Vector
(check-expect (word2vector "bread") (list 1 0 0 0 0 0 0 0 0 0 0))
(check-expect (word2vector "to")    (list 0 0 0 0 0 0 0 0 0 0 1))
(check-expect (word2vector "drugs") (list 0 0 0 0 1 0 0 0 0 0 0))
(check-expect (word2vector "eggs")  (list 0 0 0 0 0 0 0 0 0 0 0))
(define (word2vector w)
  (local [; position of w in VOCAB, or #false when w is unknown
          (define hot (w2idx w))
          ; the entry that belongs at position i of the result
          (define (entry i)
            (if (and (number? hot) (= i hot)) 1 0))]
    (build-list (length VOCAB) entry)))
| |
; Build a vector of the requested size whose entries are all 0.
; make-zero-vector : Number -> Vector
(check-expect (make-zero-vector 10) (list 0 0 0 0 0 0 0 0 0 0))
(check-expect (make-zero-vector 0) '())
(define (make-zero-vector size)
  (make-list size 0))
| |
| |
; Encode a text as a "bag of words" vector over VOCAB: entry i counts
; how often the i-th vocabulary word occurs in the text. The text is
; split on whitespace, and punctuation is dropped from every token so
; that "bread?" counts as "bread". Words outside VOCAB contribute nothing.
; text2vector : String -> Vector
(check-expect (text2vector "bread drugs") (list 1 0 0 0 1 0 0 0 0 0 0))
(check-expect (text2vector "buy buy") (list 0 0 0 2 0 0 0 0 0 0 0))
(check-expect (text2vector "did you buy bread?") (list 1 0 0 1 0 0 0 1 0 0 0))
(check-expect (text2vector "") (list 0 0 0 0 0 0 0 0 0 0 0))
(define (text2vector s)
  (local [; Keep only the letters and digits of a token.
          (define (strip-punctuation token)
            (list->string
             (filter (lambda (c) (or (char-alphabetic? c) (char-numeric? c)))
                     (string->list token))))
          ; Starting value of the sum; also the result for a text with no words.
          (define zero-vec (make-zero-vector (length VOCAB)))
          ; One one-hot vector per token of the text.
          (define word-vecs
            (map word2vector (map strip-punctuation (string-split s))))]
    (foldr vector+ zero-vec word-vecs)))
| |
| |
; A Label (y) is one of:
; - 1   (the label given to spam-instances)
; - -1  (the label given to ham-instances)
| |
; An Instance is a (make-instance Vector Label)
; x holds the vector for one example and y holds that example's label.
(define-struct instance (x y))
| |
; Turn every text into an Instance: the text is encoded with
; text2vector and paired with the one label shared by the whole list.
; strs-to-instances : [List-of String] Label -> [List-of Instance]
(define (strs-to-instances strs label)
  (map (lambda (s) (make-instance (text2vector s) label))
       strs))
| |
; The example texts as labelled instances:
; spam texts carry the label 1, ham texts carry the label -1.
(define spam-instances
  (strs-to-instances spam-emails 1))
(define ham-instances
  (strs-to-instances ham-emails -1))