Do not use test and validation datasets whilst building the vocabulary #179

Open · wants to merge 1 commit into master
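The motivation, as the diff below implies: building the character vocabulary over the entire input file lets tokens that only ever occur in the validation or test portions into the vocabulary, mildly leaking held-out data into the model's input space. A minimal sketch of the leak (the text, split point, and variable names here are illustrative, not from preprocess.py):

# Characters 'x' and 'y' occur only in the held-out tail of the text.
text = 'abab' + 'xy'
train_size = 4

vocab_full = set(text)                  # old behaviour: a, b, x, y
vocab_train = set(text[:train_size])    # this PR: a, b
print sorted(vocab_full - vocab_train)  # ['x', 'y'] would have leaked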
27 changes: 20 additions & 7 deletions scripts/preprocess.py
@@ -20,21 +20,33 @@
 if __name__ == '__main__':
   if args.encoding == 'bytes': args.encoding = None
 
-  # First go the file once to see how big it is and to build the vocab
-  token_to_idx = {}
+  # First go through the file once to see how big it is
   total_size = 0
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       total_size += len(line)
-      for char in line:
-        if char not in token_to_idx:
-          token_to_idx[char] = len(token_to_idx) + 1
 
   # Now we can figure out the split sizes
   val_size = int(args.val_frac * total_size)
   test_size = int(args.test_frac * total_size)
   train_size = total_size - val_size - test_size
 
+
+  # Scan the first train_size characters of the file in order to build the vocab
+  token_to_idx = {}
+  with codecs.open(args.input_txt, 'r', args.encoding) as f:
+    cur_idx = 0
+    for line in f:
+      for char in line:
+        cur_idx += 1
+        if cur_idx <= train_size:
+          if char not in token_to_idx:
+            token_to_idx[char] = len(token_to_idx) + 1
+        else:
+          break  # break out of the nested loops
+      else:
+        continue  # executed only if the inner loop ended without a break
+      break  # reached only via the inner break, so the outer loop stops too
 
   if not args.quiet:
     print 'Total vocabulary size: %d' % len(token_to_idx)
     print 'Total tokens in file: %d' % total_size
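The added scan relies on Python's for/else clause to stop both loops once the cursor passes the end of the training region: the else branch of a for loop runs only when the loop completes without hitting break. A standalone sketch of the idiom (the data here is made up):

for row in [[1, 2], [3, 4], [5, 6]]:
  for value in row:
    if value == 4:
      break           # leave the inner loop early
  else:
    continue          # inner loop finished normally; keep scanning
  break               # only reached after the inner break
print 'stopped at 4'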
@@ -61,7 +73,8 @@
   with codecs.open(args.input_txt, 'r', args.encoding) as f:
     for line in f:
       for char in line:
-        splits[split_idx][cur_idx] = token_to_idx[char]
+        if char in token_to_idx:
+          splits[split_idx][cur_idx] = token_to_idx[char]
         cur_idx += 1
         if cur_idx == splits[split_idx].size:
           split_idx += 1
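Assuming the split arrays are preallocated with np.zeros, as they appear to be earlier in preprocess.py, and given that token indices start at 1, a validation or test character missing from the training vocabulary now simply leaves a 0 in place, which acts as an implicit out-of-vocabulary marker. A sketch of that effect (the vocab and text are invented for illustration):

import numpy as np

token_to_idx = {'a': 1, 'b': 2}   # vocab built from the training split only
test_text = 'abx'                 # 'x' never appeared in training
test = np.zeros(len(test_text), dtype=np.uint8)
for i, char in enumerate(test_text):
  if char in token_to_idx:
    test[i] = token_to_idx[char]
print test                        # [1 2 0]; the 0 marks the unseen 'x'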