(require 2htdp/batch-io) |
(require racket/string) ; Just for string-split |
|
; A Vector (x) is a [List-of Number]
|
|
; spam-emails : [List-of String]
; Example spam message texts.
(define spam-emails
  (list "buy these drugs"
        "drugs for sale"
        "how to buy buy buy"))
|
; ham-emails : [List-of String]
; Example non-spam ("ham") message texts.
(define ham-emails
  (list "how was your break?"
        "did you buy bread?"))
|
; A Vocabulary is a [List-of String]
; VOCAB lists every known word. A word's position in this list is the
; slot it occupies in the vectors built from text.
(define VOCAB
  (list "bread" "break" "sale" "buy"
        "drugs" "for" "these" "you"
        "your" "how" "to"))
|
|
; vector+ : Vector Vector -> Vector
; Element-wise sum of two vectors. Both vectors are expected to have
; the same length.
(check-expect (vector+ (list 1 2 3) (list 4 5 6)) (list 5 7 9))
(define (vector+ v1 v2)
  ; map walks both lists in lock-step, adding the paired entries.
  (map + v1 v2))
|
|
; Look up the position of a word in VOCAB.
; w2idx : String -> [Maybe Number]
; Produces the 0-based index of w in VOCAB, or #false when w does not
; occur there.
(check-expect (w2idx "break") 1)
(check-expect (w2idx "eggs") #false)
(define (w2idx w)
  (local [; scan : Vocabulary Number -> [Maybe Number]
          ; ACCUMULATOR: pos counts the words already skipped, i.e. it is
          ; the VOCAB index of (first remaining).
          (define (scan remaining pos)
            (cond
              [(empty? remaining) #false] ; ran out of words: w is unknown
              [else
               (if (string=? w (first remaining))
                   pos
                   (scan (rest remaining) (add1 pos)))]))]
    (scan VOCAB 0)))
|
; Encode a word as a "one-hot" vector over VOCAB.
; word2vector : String -> Vector
; The result has (length VOCAB) entries: 1 at the word's VOCAB index and
; 0 everywhere else. A word that is not in VOCAB yields all zeros.
(check-expect (word2vector "bread") (list 1 0 0 0 0 0 0 0 0 0 0))
(check-expect (word2vector "to") (list 0 0 0 0 0 0 0 0 0 0 1))
(check-expect (word2vector "drugs") (list 0 0 0 0 1 0 0 0 0 0 0))
(check-expect (word2vector "eggs") (list 0 0 0 0 0 0 0 0 0 0 0))
(define (word2vector w)
  (local [; Index of w in VOCAB, or #false when w is unknown.
          (define hot (w2idx w))
          ; slot : Number -> Number
          ; Entry for position i: 1 only at the word's own index.
          (define (slot i)
            (if (and (number? hot) (= i hot)) 1 0))]
    (build-list (length VOCAB) slot)))
|
; make-zero-vector : Number -> Vector
; Produces a vector with `size` entries, every one of them 0.
(check-expect (make-zero-vector 10) (list 0 0 0 0 0 0 0 0 0 0))
(define (make-zero-vector size)
  (make-list size 0))
|
|
; Encode a text as a "bag of words" vector: entry i counts how many
; times the i-th VOCAB word occurs in the text.
; text2vector : String -> Vector
; Every character that is not a letter or a digit acts as a word
; separator, so "bread?" is counted as "bread". Matching against VOCAB
; is case-sensitive, and words that are not in VOCAB contribute nothing.
(check-expect (text2vector "bread drugs") (list 1 0 0 0 1 0 0 0 0 0 0))
(check-expect (text2vector "did you buy bread?") (list 1 0 0 1 0 0 0 1 0 0 0))
(check-expect (text2vector "how to buy buy buy") (list 0 0 0 3 0 0 0 0 0 1 1))
(check-expect (text2vector "") (list 0 0 0 0 0 0 0 0 0 0 0))
(define (text2vector s)
  (local [; Starting value for the fold; also the result for a text
          ; that contains no words at all.
          (define zero-vec (make-zero-vector (length VOCAB)))
          ; separator->space : Char -> Char
          ; Keep letters and digits, turn anything else into a space so
          ; punctuation glued to a word does not block the VOCAB lookup.
          (define (separator->space c)
            (if (or (char-alphabetic? c) (char-numeric? c)) c #\space))
          ; The words of s, split on the normalised whitespace.
          (define words
            (string-split
             (list->string (map separator->space (string->list s)))))]
    ; Sum the one-hot vectors of all words.
    (foldr vector+ zero-vec (map word2vector words))))
|
|
; A Label (y) is one of:
; -  1   (spam)
; - -1   (ham)
|
; An Instance is a (make-instance Vector Label)
; - x : the vector encoding of a text
; - y : the label attached to that text
(define-struct instance (x y))
|
; strs-to-instances : [List-of String] Label -> [List-of Instance]
; Turn each text into an Instance: its x is the text's vector encoding
; (via text2vector) and its y is the given label, shared by all texts.
(define (strs-to-instances strs label)
  (map (lambda (text) (make-instance (text2vector text) label))
       strs))
|
; Labelled instances built from the example texts:
; spam texts get label 1, ham texts get label -1.
(define spam-instances
  (strs-to-instances spam-emails 1))
(define ham-instances
  (strs-to-instances ham-emails -1))