forked from tensorflow/models
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_utils.py
119 lines (95 loc) · 4.14 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for parsing Kaggle baby names files."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import numpy as np
import tensorflow as tf
import pandas as pd
# the default end of name rep will be zero
_EON = 0
def read_names(names_path):
"""read data from downloaded file. See SmallNames.txt for example format
or go to https://www.kaggle.com/kaggle/us-baby-names for full lists
Args:
names_path: path to the csv file similar to the example type
Returns:
Dataset: a namedtuple of two elements: deduped names and their associated
counts. The names contain only 26 chars and are all lower case
"""
names_data = pd.read_csv(names_path)
names_data.Name = names_data.Name.str.lower()
name_data = names_data.groupby(by=["Name"])["Count"].sum()
name_counts = np.array(name_data.tolist())
names_deduped = np.array(name_data.index.tolist())
Dataset = collections.namedtuple('Dataset', ['Name', 'Count'])
return Dataset(names_deduped, name_counts)
def _letter_to_number(letter):
"""converts letters to numbers between 1 and 27"""
# ord of lower case 'a' is 97
return ord(letter) - 96
def namignizer_iterator(names, counts, batch_size, num_steps, epoch_size):
"""Takes a list of names and counts like those output from read_names, and
makes an iterator yielding a batch_size by num_steps array of random names
separated by an end of name token. The names are chosen randomly according
to their counts. The batch may end mid-name
Args:
names: a set of lowercase names composed of 26 characters
counts: a list of the frequency of those names
batch_size: int
num_steps: int
epoch_size: number of batches to yield
Yields:
(x, y): a batch_size by num_steps array of ints representing letters, where
x will be the input and y will be the target
"""
name_distribution = counts / counts.sum()
for i in range(epoch_size):
data = np.zeros(batch_size * num_steps + 1)
samples = np.random.choice(names, size=batch_size * num_steps // 2,
replace=True, p=name_distribution)
data_index = 0
for sample in samples:
if data_index >= batch_size * num_steps:
break
for letter in map(_letter_to_number, sample) + [_EON]:
if data_index >= batch_size * num_steps:
break
data[data_index] = letter
data_index += 1
x = data[:batch_size * num_steps].reshape((batch_size, num_steps))
y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps))
yield (x, y)
def name_to_batch(name, batch_size, num_steps):
""" Takes a single name and fills a batch with it
Args:
name: lowercase composed of 26 characters
batch_size: int
num_steps: int
Returns:
x, y: a batch_size by num_steps array of ints representing letters, where
x will be the input and y will be the target. The array is filled up
to the length of the string, the rest is filled with zeros
"""
data = np.zeros(batch_size * num_steps + 1)
data_index = 0
for letter in map(_letter_to_number, name) + [_EON]:
data[data_index] = letter
data_index += 1
x = data[:batch_size * num_steps].reshape((batch_size, num_steps))
y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps))
return x, y