Issue
I have a TensorFlow model SavedModel
which includes saved_model.pb
and variables
folder. The preprocessing step has not been incorporated into this model that's why I need to do preprocessing(Tokenization etc) before feeding the data to the model for the prediction aspect.
I am looking for an approach that I can incorporate the preprocessing step into the model. I have seen examples here and here however they are image data.
Just to get an idea how the training part has been done, this is a portion of the code that we did training (if you need the implementation of the function I have used here, please let me know(I did not include it to make my question more understandable ))
Training:
processor = IntentProcessor(FLAGS.data_path, FLAGS.test_data_path,
FLAGS.test_proportion, FLAGS.seed, FLAGS.do_early_stopping)
bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
tokenizer = tokenization.FullTokenizer(
vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
run_config = tf.estimator.RunConfig(
model_dir=FLAGS.output_dir,
save_checkpoints_steps=FLAGS.save_checkpoints_steps)
train_examples = None
num_train_steps = None
num_warmup_steps = None
if FLAGS.do_train:
train_examples = processor.get_train_examples()
num_iter_per_epoch = int(len(train_examples) / FLAGS.train_batch_size)
num_train_steps = num_iter_per_epoch * FLAGS.num_train_epochs
num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
run_config = tf.estimator.RunConfig(
model_dir=FLAGS.output_dir,
save_checkpoints_steps=num_iter_per_epoch)
best_temperature = 1.0 # Initiate the best T value as 1.0 and will
# update this during the training
model_fn = model_fn_builder(
bert_config=bert_config,
num_labels=len(processor.le.classes_),
init_checkpoint=FLAGS.init_checkpoint,
learning_rate=FLAGS.learning_rate,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps,
best_temperature=best_temperature,
seed=FLAGS.seed)
estimator = tf.estimator.Estimator(
model_fn=model_fn,
config=run_config)
# add parameters by passing a prams variable
if FLAGS.do_train:
train_features = convert_examples_to_features(
train_examples, FLAGS.max_seq_length, tokenizer)
train_labels = processor.get_train_labels()
train_input_fn = input_fn_builder(
features=train_features,
is_training=True,
batch_size=FLAGS.train_batch_size,
seed=FLAGS.seed,
labels=train_labels
)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
And this is the preprocessing that I use for the training:
LABEL_LIST = ['negative', 'neutral', 'positive']
INTENT_MAP = {i: LABEL_LIST[i] for i in range(len(LABEL_LIST))}
BATCH_SIZE = 1
MAX_SEQ_LEN = 70
def convert_examples_to_features(texts, max_seq_length, tokenizer):
"""Loads a data file into a list of InputBatchs.
texts is the list of input text
"""
features = {}
input_ids_list = []
input_mask_list = []
segment_ids_list = []
for (ex_index, text) in enumerate(texts):
tokens_a = tokenizer.tokenize(str(text))
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# print(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
input_ids_list.append(input_ids)
input_mask_list.append(input_mask)
segment_ids_list.append(segment_ids)
features['input_ids'] = np.asanyarray(input_ids_list)
features['input_mask'] = np.asanyarray(input_mask_list)
features['segment_ids'] = np.asanyarray(segment_ids_list)
# tf.data.Dataset.from_tensor_slices needs to pass numpy array not
# tensor, or the tensor graph (shape) should match
return features
and inferencing would be like this:
def inference(texts,MODEL_DIR, VOCAB_FILE):
if not isinstance(texts, list):
texts = [texts]
tokenizer = FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
features = convert_examples_to_features(texts, MAX_SEQ_LEN, tokenizer)
predict_fn = predictor.from_saved_model(MODEL_DIR)
response = predict_fn(features)
#print(response)
return get_sentiment(response)
def preprocess(texts):
if not isinstance(texts, list):
texts = [texts]
tokenizer = FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=False)
features = convert_examples_to_features(texts, MAX_SEQ_LEN, tokenizer)
return features
def get_sentiment(response):
idx = response['intent'].tolist()
print(idx)
print(INTENT_MAP.get(idx[0]))
outputs = []
for i in range(0, len(idx)):
outputs.append({
"sentiment": INTENT_MAP.get(idx[i]),
"confidence": response['prob'][i][idx[i]]
})
return outputs
sentence = 'The movie is ok'
inference(sentence, args.model_path, args.vocab_path)
And this is the implementation of model_fn_builder
:
def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
num_train_steps, num_warmup_steps, best_temperature, seed):
"""Returns multi-intents `model_fn` closure for Estimator"""
def model_fn(features, labels, mode,
params): # pylint: disable=unused-argument
"""The `model_fn` for Estimator."""
tf.logging.info("*** Features ***")
for name in sorted(features.keys()):
tf.logging.info(
" name = %s, shape = %s" % (name, features[name].shape))
input_ids = features["input_ids"]
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
(total_loss, per_example_loss, logits) = create_intent_model(
bert_config, is_training, input_ids, input_mask, segment_ids,
labels, num_labels, mode, seed)
tvars = tf.trainable_variables()
initialized_variable_names = None
if init_checkpoint:
(assignment_map,
initialized_variable_names) = \
modeling.get_assignment_map_from_checkpoint(
tvars, init_checkpoint)
tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
tf.logging.info("**** Trainable Variables ****")
for var in tvars:
init_string = ""
if var.name in initialized_variable_names:
init_string = ", *INIT_FROM_CKPT*"
tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
init_string)
output_spec = None
if mode == tf.estimator.ModeKeys.TRAIN:
train_op = optimization.create_optimizer(
total_loss, learning_rate, num_train_steps, num_warmup_steps)
output_spec = tf.estimator.EstimatorSpec(
mode=mode,
loss=total_loss,
train_op=train_op)
elif mode == tf.estimator.ModeKeys.EVAL:
def metric_fn(per_example_loss, labels, logits):
predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
accuracy = tf.metrics.accuracy(labels, predictions)
loss = tf.metrics.mean(per_example_loss)
return {
"eval_accuracy": accuracy,
"eval_loss": loss
}
eval_metrics = metric_fn(per_example_loss, labels, logits)
output_spec = tf.estimator.EstimatorSpec(
mode=mode,
loss=total_loss,
eval_metric_ops=eval_metrics)
elif mode == tf.estimator.ModeKeys.PREDICT:
predictions = {
'intent': tf.argmax(logits, axis=-1, output_type=tf.int32),
'prob': tf.nn.softmax(logits / tf.constant(best_temperature)),
'logits': logits
}
output_spec = tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions)
return output_spec
return model_fn
And this is the implementation of create_intent_model
def create_intent_model(bert_config, is_training, input_ids, input_mask,
segment_ids,
labels, num_labels, mode, seed):
model = modeling.BertModel(
config=bert_config,
is_training=is_training,
input_ids=input_ids,
input_mask=input_mask,
token_type_ids=segment_ids,
use_one_hot_embeddings=False,
seed=seed
)
output_layer = model.get_pooled_output()
hidden_size = output_layer.shape[-1].value
with tf.variable_scope("loss"):
output_weights = tf.get_variable(
"output_weights", [num_labels, hidden_size],
initializer=tf.truncated_normal_initializer(stddev=0.02, seed=seed))
output_bias = tf.get_variable(
"output_bias", [num_labels], initializer=tf.zeros_initializer())
if is_training:
# I.e., 0.1 dropout
output_layer = tf.nn.dropout(output_layer, keep_prob=0.9, seed=seed)
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
loss = None
per_example_loss = None
if mode == tf.estimator.ModeKeys.TRAIN or mode == \
tf.estimator.ModeKeys.EVAL:
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(labels, depth=num_labels,
dtype=tf.float32)
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
axis=-1)
loss = tf.reduce_mean(per_example_loss)
return loss, per_example_loss, logits
This is the list tensorflow
related libraries:
tensorboard==1.15.0
tensorflow-estimator==1.15.1
tensorflow-gpu==1.15.0
There is good documentation here, however, it uses Keras API. Plus, I don't know how can I incorporate preprocessing layer here even with the Keras API.
Again, my final goal is to incorporate the preprocessing step into the model building phase so that when I later load the model I directly pass the The movie is ok
to the model?
I just need the idea on how to incorporate a preprocessing layer into this code which is function based.
Thanks in advance~
Solution
You can use the TextVectorization
layer as follows. But to answer your question fully, I'd need to know what's in model_fn_builder()
function. I'll show how you can do this with Keras model building API.
class BertTextProcessor(tf.keras.layers.Layer):
def __init__(self, max_length):
super().__init__()
self.max_length = max_length
# Here I'm setting any preprocessing to none
# by default this layer lowers case and remove punctuation
# i.e. tokens like [CLS] would become cls
self.vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=max_length, standardize=None)
def call(self, inputs):
inputs = "[CLS] " + inputs + " [SEP]"
tok_inputs = self.vectorizer(inputs)
return {
"input_ids": tok_inputs,
"input_mask": tf.cast(tok_inputs != 0, 'int32'),
"segment_ids": tf.zeros_like(tok_inputs)
}
def adapt(self, data):
data = "[CLS] " + data + " [SEP]"
self.vectorizer.adapt(data)
def get_config(self):
return {
"max_length": self.max_length
}
Usage,
input_str = tf.constant(["movie is okay good plot very nice", "terrible movie bad actors not good"])
proc = BertTextProcessor(8)
# You need to call this so that the vectorizer layer learns the vocabulary
proc.adapt(input_str)
print(proc(input_str))
which outputs,
{'input_ids': <tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[ 5, 2, 12, 9, 3, 8, 6, 11, 4, 0],
[ 5, 7, 2, 13, 14, 10, 3, 4, 0, 0]])>, 'input_mask': <tf.Tensor: shape=(2, 10), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]], dtype=int32)>, 'segment_ids': <tf.Tensor: shape=(2, 10), dtype=int64, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}
You can use this layer as an input for a Keras model as you would use any layer.
You can also get the vocabulary using, proc.vectorizer.get_vocabulary()
which returns,
['',
'[UNK]',
'movie',
'good',
'[SEP]',
'[CLS]',
'very',
'terrible',
'plot',
'okay',
'not',
'nice',
'is',
'bad',
'actors']
Alternative with tf-models-official
To get data in a format accepted by BERT, you can also use the tf-models-official
library. Specifically, you can use the BertPackInputs
object.
I recently updated code for one of my books and in Chapter 13/13.1_Spam_Classification you can see how it is used. The section Generating the correct input format for BERT shows how this could be done.
Answered By - thushv89
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.