Open In Colab

How to convert text to one-hot vectors

A tutorial on converting the words of a sentence into one-hot vectors

import numpy as np
# Sample sentence to encode. The continuation line must not carry the
# interactive-prompt "..." marker: inside a triple-quoted string it is literal
# text, and whitespace splitting would turn it into a bogus "..." token.
sentence = """Thomas Jefferson began building Monticello at the
  age of 26."""
# Whitespace tokenisation: punctuation stays attached, so "26." is one token.
token_sequence = sentence.split()
# Vocabulary = the unique tokens in sorted order; a token's position in this
# list is the column it occupies in a one-hot vector.
vocab = sorted(set(token_sequence))
', '.join(vocab)
'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'
vocab_size = len(vocab)
num_tokens = len(token_sequence)
# One row per token in the sentence, one column per vocabulary word,
# initialised to all zeros.
onehot_vectors = np.zeros((num_tokens, vocab_size), int)
# Build the word -> column lookup once. Calling vocab.index(word) inside the
# loop would rescan the vocabulary list for every token.
column_of = {word: col for col, word in enumerate(vocab)}
for row, word in enumerate(token_sequence):
    # Mark the single column that corresponds to this token's word.
    onehot_vectors[row, column_of[word]] = 1
onehot_vectors
array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
import pandas as pd
# Show the one-hot matrix as a table whose column headers are the vocabulary
# words, so each row can be read as "which word is this token".
pd.DataFrame(
    data=onehot_vectors,
    columns=vocab,
)
26. Jefferson Monticello Thomas age at began building of the
0 0 0 0 1 0 0 0 0 0 0
1 0 1 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 1 0 0 0
3 0 0 0 0 0 0 0 1 0 0
4 0 0 1 0 0 0 0 0 0 0
5 0 0 0 0 0 1 0 0 0 0
6 0 0 0 0 0 0 0 0 0 1
7 0 0 0 0 1 0 0 0 0 0
8 0 0 0 0 0 0 0 0 1 0
9 1 0 0 0 0 0 0 0 0 0