Getting `ValueError: as_list() is not defined on an unknown TensorShape.` when trying to tokenize as part of the model

I am trying to do tokenization as part of my model, since that should reduce CPU and RAM usage and make better use of my GPU. But I keep running into `ValueError: as_list() is not defined on an unknown TensorShape.`
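
For context, the kind of in-model tokenization I am after is what the built-in `TextVectorization` layer does; I show it below purely for comparison, because I want to understand why my own layer fails:

import tensorflow as tf

# In-model tokenization with the built-in layer, shown for comparison only.
vectorize = tf.keras.layers.TextVectorization(
    max_tokens=1000,           # cap on the vocabulary size
    output_sequence_length=3,  # pad/truncate every sequence to length 3
)
# TextVectorization learns its vocabulary from sample text before use.
vectorize.adapt(tf.constant(["hello world", "foo bar baz"]))
print(vectorize(tf.constant(["hello world"])))  # int tensor of shape (1, 3)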

I have created a layer called `TokenizationLayer` that takes care of the tokenization; it is defined as:

class TokenizationLayer(Layer):
    def __init__(self, max_length, **kwargs):
        super(TokenizationLayer, self).__init__(**kwargs)
        self.max_length = max_length
        self.tokenizer = Tokenizer()

    def build(self, input_shape):
        super(TokenizationLayer, self).build(input_shape)

    def tokenize_sequences(self, x):
        # Tokenization function
        return self.tokenizer.texts_to_sequences([x.numpy()])[0]

    def call(self, inputs):
        # Use tf.py_function to apply tokenization element-wise
        sequences = tf.map_fn(lambda x: tf.py_function(self.tokenize_sequences, [x], tf.int32), inputs, dtype=tf.int32)
        # Masking step
        mask = tf.math.logical_not(tf.math.equal(sequences, 0))
        return tf.where(mask, sequences, -1)  # Using -1 as a mask value

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.max_length)  # Use self.max_length instead of trying to access shape

But it keeps giving me an error saying `as_list() is not defined on an unknown TensorShape`.
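
My suspicion is that the tensor returned by `tf.py_function` carries no static shape at all, so the downstream `Embedding`/`LSTM` layers cannot infer their input shapes. Here is a small check that, I believe, demonstrates this:

import tensorflow as tf

@tf.function
def probe(x):
    # py_function wraps eager Python code, but its output loses all
    # static shape information inside the graph.
    y = tf.py_function(lambda t: t + 1, [x], tf.int32)
    print("static shape during tracing:", y.shape)  # prints: <unknown>
    return y

probe(tf.constant([1, 2, 3], dtype=tf.int32))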

Here is the complete code, if you need it:

import tensorflow as tf
from tensorflow.keras.layers import Layer, Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class TokenizationLayer(Layer):
    def __init__(self, max_length, **kwargs):
        super(TokenizationLayer, self).__init__(**kwargs)
        self.max_length = max_length
        self.tokenizer = Tokenizer()

    def build(self, input_shape):
        super(TokenizationLayer, self).build(input_shape)

    def tokenize_sequences(self, x):
        # Tokenization function
        return self.tokenizer.texts_to_sequences([x.numpy()])[0]

    def call(self, inputs):
        # Use tf.py_function to apply tokenization element-wise
        sequences = tf.map_fn(lambda x: tf.py_function(self.tokenize_sequences, [x], tf.int32), inputs, dtype=tf.int32)
        # Masking step
        mask = tf.math.logical_not(tf.math.equal(sequences, 0))
        return tf.where(mask, sequences, -1)  # Using -1 as a mask value

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.max_length)  # Use self.max_length instead of trying to access shape

# Build the model with the custom tokenization layer
def build_model(vocab_size, max_length):
    input1 = Input(shape=(1,), dtype=tf.string)
    input2 = Input(shape=(1,), dtype=tf.string)

    # Tokenization layer
    tokenization_layer = TokenizationLayer(max_length)
    embedded_seq1 = tokenization_layer(input1)
    embedded_seq2 = tokenization_layer(input2)

    # Embedding layer for encoding strings
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length)

    # Encode first string
    lstm_out1 = LSTM(64)(embedding_layer(embedded_seq1))

    # Encode second string
    lstm_out2 = LSTM(64)(embedding_layer(embedded_seq2))

    # Concatenate outputs
    concatenated = Concatenate()([lstm_out1, lstm_out2])

    # Dense layer for final output
    output = Dense(1, activation='relu')(concatenated)

    # Build model
    model = Model(inputs=[input1, input2], outputs=output)
    return model

string1 = "hello world"
string2 = "foo bar baz"

max_length = max(len(string1.split()), len(string2.split()))

model = build_model(vocab_size=1000, max_length=max_length)
model.summary()

labels = tf.random.normal((1, 5))
model.compile(optimizer='adam', loss='mse')
model.fit([tf.constant([string1]), tf.constant([string2])], labels, epochs=10, batch_size=1, validation_split=0.2)

Here is the full stack trace:

WARNING:tensorflow:From /usr/local/lib/python3.10/dist-packages/tensorflow/python/util/deprecation.py:660: calling map_fn_v2 (from tensorflow.python.ops.map_fn) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Use fn_output_signature instead
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-1-23051fe36790> in <cell line: 64>()
     62 max_length = max(len(string1.split()), len(string2.split()))
     63 
---> 64 model = build_model(vocab_size=1000, max_length=max_length)
     65 model.summary()
     66 

2 frames
<ipython-input-1-23051fe36790> in build_model(vocab_size, max_length)
     42 
     43     # Encode first string
---> 44     lstm_out1 = LSTM(64)(embedding_layer(embedded_seq1))
     45 
     46     # Encode second string

/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py in error_handler(*args, **kwargs)
     68             # To get the full stack trace, call:
     69             # `tf.debugging.disable_traceback_filtering()`
---> 70             raise e.with_traceback(filtered_tb) from None
     71         finally:
     72             del filtered_tb

/usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/tensor_shape.py in as_list(self)
   1438     """
   1439     if self._dims is None:
-> 1440       raise ValueError("as_list() is not defined on an unknown TensorShape.")
   1441     return list(self._dims)
   1442 

ValueError: as_list() is not defined on an unknown TensorShape.
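
If the unknown static shape coming out of `tf.py_function` really is the culprit, I assume the fix would be to declare the shape explicitly after the map, along the lines of the untested sketch below. Here `tokenize_and_pad` is a hypothetical variant of my `tokenize_sequences` that also pads every result to `self.max_length`, since `tf.map_fn` needs each element to come back with the same shape. Is this the right direction?

    def call(self, inputs):
        # tokenize_and_pad is a hypothetical helper: tokenize_sequences
        # plus padding of every sequence to self.max_length.
        sequences = tf.map_fn(
            lambda x: tf.py_function(self.tokenize_and_pad, [x], tf.int32),
            inputs,
            fn_output_signature=tf.int32,  # replaces the deprecated dtype=
        )
        # tf.py_function outputs carry no static shape, so declare it.
        sequences.set_shape((None, self.max_length))
        mask = tf.math.logical_not(tf.math.equal(sequences, 0))
        return tf.where(mask, sequences, -1)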