# -*- coding: utf-8 -*-
import json
import math
from random import shuffle
from typing import Dict, List
import numpy as np
from boltons.iterutils import chunked_iter
import tensorflow as tf
from langml.baselines import BaseDataLoader
from langml import TF_KERAS
if TF_KERAS:
from tensorflow.keras.preprocessing.sequence import pad_sequences
else:
from keras.preprocessing.sequence import pad_sequences
class DataLoader(BaseDataLoader):
    """Batch generator turning (text, label) pairs into padded model inputs.

    Each text is run through ``tokenizer.encode`` and each label is mapped to
    an integer through ``label2id``; batches are post-padded with
    ``pad_sequences``.
    """

    def __init__(self,
                 data: List,
                 tokenizer: object,
                 label2id: Dict,
                 batch_size: int = 32,
                 is_bert: bool = True):
        """Store the dataset and batching configuration.

        Args:
            data: sequence of ``(text, label)`` pairs.
            tokenizer: object whose ``encode(text)`` result exposes ``ids``
                and ``segment_ids`` attributes.
            label2id: mapping from a label to its integer id.
            batch_size: number of examples per batch.
            is_bert: when true, batches carry both token and segment ids;
                otherwise only token ids.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.batch_size = batch_size
        self.is_bert = is_bert

    def __len__(self) -> int:
        """Return the number of batches needed to cover ``self.data``."""
        batches = len(self.data) / self.batch_size
        return math.ceil(batches)

    @staticmethod
    def load_data(fpath: str, build_vocab: bool = False) -> List:
        """Read a JSON-lines file of objects with ``text`` and ``label`` keys.

        Blank lines are skipped.

        Args:
            fpath: path of the UTF-8 encoded file to read.
            build_vocab: when true, also build a label-to-id mapping in which
                ids follow the order of first appearance of each label.

        Returns:
            A list of ``(text, label)`` tuples, or the tuple
            ``(data, label2id)`` when ``build_vocab`` is true.
        """
        examples = []
        label_vocab = {} if build_vocab else None
        with open(fpath, 'r', encoding='utf-8') as reader:
            for raw_line in reader:
                stripped = raw_line.strip()
                if not stripped:
                    continue
                record = json.loads(stripped)
                if build_vocab:
                    # Only unseen labels receive a new (next free) id.
                    label_vocab.setdefault(record['label'], len(label_vocab))
                examples.append((record['text'], record['label']))
        return (examples, label_vocab) if build_vocab else examples

    def make_iter(self, random: bool = False):
        """Yield one ``(inputs, labels)`` pair per batch.

        Args:
            random: when true, ``self.data`` is shuffled in place before
                batching.

        Yields:
            ``([tokens, segments], labels)`` when ``is_bert`` is set,
            otherwise ``(tokens, labels)``. ``labels`` is an array holding a
            one-element row per example.
        """
        def _pad(sequences):
            return pad_sequences(sequences, padding='post', truncating='post')

        if random:
            shuffle(self.data)

        for chunk in chunked_iter(self.data, self.batch_size):
            token_rows, segment_rows, label_rows = [], [], []
            for text, label in chunk:
                encoded = self.tokenizer.encode(text)
                token_rows.append(encoded.ids)
                segment_rows.append(encoded.segment_ids)
                label_rows.append([self.label2id[label]])

            tokens = _pad(token_rows)
            segments = _pad(segment_rows)
            labels = np.array(label_rows)

            inputs = [tokens, segments] if self.is_bert else tokens
            yield inputs, labels
class TFDataLoader(DataLoader):
    """``DataLoader`` variant that serves examples as a ``tf.data.Dataset``."""

    def make_iter(self, random: bool = False):
        """Build a repeated, padded and batched dataset over ``self.data``.

        Args:
            random: when true, examples are shuffled with a buffer of
                ``batch_size * 1000`` elements.

        Returns:
            A ``tf.data.Dataset`` of ``(features, label)`` batches. With
            ``is_bert`` set, ``features`` is a dict with ``'Input-Token'``
            and ``'Input-Segment'`` entries; otherwise it is the token ids
            alone. Each label is a one-element vector holding the label id.
        """
        def gen_features():
            for text, label in self.data:
                tokenized = self.tokenizer.encode(text)
                # Map the label to its integer id in BOTH branches: the
                # dataset declares labels as tf.int64, and the non-BERT path
                # previously yielded the raw, unmapped label.
                label_id = [self.label2id[label]]
                if self.is_bert:
                    yield {'Input-Token': tokenized.ids,
                           'Input-Segment': tokenized.segment_ids}, label_id
                else:
                    yield tokenized.ids, label_id

        if self.is_bert:
            output_types = ({'Input-Token': tf.int64, 'Input-Segment': tf.int64}, tf.int64)
            output_shapes = ({'Input-Token': tf.TensorShape((None, )),
                              'Input-Segment': tf.TensorShape((None, ))},
                             tf.TensorShape((1, )))
        else:
            output_types = (tf.int64, tf.int64)
            output_shapes = (tf.TensorShape((None, )), tf.TensorShape((1, )))

        dataset = tf.data.Dataset.from_generator(gen_features,
                                                 output_types=output_types,
                                                 output_shapes=output_shapes)
        dataset = dataset.repeat()
        if random:
            dataset = dataset.shuffle(self.batch_size * 1000)
        # The per-element shapes double as the padded shapes: variable-length
        # (None) dimensions are padded within each batch.
        dataset = dataset.padded_batch(self.batch_size, output_shapes)
        return dataset

    def __call__(self, random: bool = False):
        """Return the dataset built by ``make_iter(random=random)``."""
        return self.make_iter(random=random)