I'm trying to obtain a model summary of Grounding DINO. I tried to use the torch-summary library for this, but I'm having trouble specifying the correct input size, which is required to call the summary function.
Since Grounding DINO is a multi-modal model (it takes (image, text) pairs as input), I'm struggling to figure out what kind of input size, and in what format, I should pass to the summary function.
import torch
from groundingdino.util.inference import load_model
from torchsummary import summary

model = load_model(CONFIG_PATH, WEIGHTS_PATH)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
summary(model, input_size)  # input_size is exactly what I can't figure out
I tried passing the following as the input_size parameter (the exact calls I used are shown right after this list):
- just the image size (e.g. (3, 224, 224))
- a list containing the image size and a text prompt (e.g. [(3, 224, 224), 'some text'])
- the image size extended with another integer that might represent the length of the textual input (e.g. (3, 224, 224, 10))
- a list containing the image size and an integer that might represent the length of the textual input (e.g. [(3, 224, 224), 10] and [(3, 224, 224), (10,)])
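Concretely, the attempts looked like this (same model-loading code as above; the text length of 10 is just a placeholder I made up):
summary(model, (3, 224, 224))                 # image size only
summary(model, [(3, 224, 224), 'some text'])  # image size + text prompt
summary(model, (3, 224, 224, 10))             # image size + assumed text length
summary(model, [(3, 224, 224), 10])           # image size + assumed text length, as a list
summary(model, [(3, 224, 224), (10,)])        # same, but text length as a size tuple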
but all of the attempts resulted in an error. For example, the one that occurred with the first attempt is:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in summary(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)
139 with torch.no_grad():
--> 140 _ = model.to(device)(*x, *args, **kwargs) # type: ignore[misc]
141 except Exception as e:
3 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
/content/GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py in forward(self, samples, targets, **kw)
242 if targets is None:
--> 243 captions = kw["captions"]
244 else:
KeyError: 'captions'
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
<ipython-input-9-f3881fbb51d4> in <cell line: 9>()
7 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8 model = model.to(device)
----> 9 summary(model, (3, 224, 224))
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in summary(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)
141 except Exception as e:
142 executed_layers = [layer for layer in summary_list if layer.executed]
--> 143 raise RuntimeError(
144 "Failed to run torchsummary. See above stack traces for more details. "
145 "Executed layers up to: {}".format(executed_layers)
RuntimeError: Failed to run torchsummary. See above stack traces for more details. Executed layers up to: []
It seems like the model expects some keyword input (captions). I looked at the prediction-related code in the GitHub repo, as well as at the forward method of the model, but I still wasn't able to fix the problem.
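For reference, this is (approximately) the relevant part of the model's forward method, reconstructed from the traceback above:
# groundingdino/models/GroundingDINO/groundingdino.py (approximate reconstruction from the traceback)
def forward(self, samples, targets=None, **kw):
    if targets is None:
        captions = kw["captions"]
    else:
        captions = [t["caption"] for t in targets]
    ...
Since torch-summary forwards any extra *args and **kwargs to the model (line 140 in the traceback), I suppose the caption should be passed as a keyword argument, e.g. something like summary(model, some_image_tensor, captions=['some text']), but I'm not sure whether that's the intended usage or what format the samples argument is expected to have.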
The torch-summary docs also state that it is possible to pass actual input data in place of an input size and let the function infer what it needs to print the summary, so I tried the following:
from groundingdino.util.inference import load_image

image_source, image = load_image(IMG_PATH)
caption = 'some text'
summary(model, image, caption)  # pass the actual inputs instead of sizes
but it generated the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in summary(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)
139 with torch.no_grad():
--> 140 _ = model.to(device)(*x, *args, **kwargs) # type: ignore[misc]
141 except Exception as e:
4 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py in _call_impl(self, *args, **kwargs)
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
/content/GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py in forward(self, samples, targets, **kw)
244 else:
--> 245 captions = [t["caption"] for t in targets]
246
/content/GroundingDINO/groundingdino/models/GroundingDINO/groundingdino.py in <listcomp>(.0)
244 else:
--> 245 captions = [t["caption"] for t in targets]
246
TypeError: string indices must be integers
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
<ipython-input-7-a5f3a38c6e5a> in <cell line: 9>()
7 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8 model = model.to(device)
----> 9 summary(model, image, TEXT_PROMPT)
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in summary(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)
141 except Exception as e:
142 executed_layers = [layer for layer in summary_list if layer.executed]
--> 143 raise RuntimeError(
144 "Failed to run torchsummary. See above stack traces for more details. "
145 "Executed layers up to: {}".format(executed_layers)
RuntimeError: Failed to run torchsummary. See above stack traces for more details. Executed layers up to: []
summary(model, {'image': image, 'captions': [caption]}) instead generated the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-10-5139855142c9> in <cell line: 9>()
7 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
8 model = model.to(device)
----> 9 summary(model, {'image':image, 'captions':[caption]})
1 frames
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in summary(model, input_data, batch_dim, branching, col_names, col_width, depth, device, dtypes, verbose, *args, **kwargs)
134 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
135
--> 136 x, input_size = process_input_data(input_data, batch_dim, device, dtypes)
137 args, kwargs = set_device(args, device), set_device(kwargs, device)
138 try:
/usr/local/lib/python3.10/dist-packages/torchsummary/torchsummary.py in process_input_data(input_data, batch_dim, device, dtypes)
217
218 else:
--> 219 raise TypeError(
220 "Input type is not recognized. Please ensure input_data is valid.\n"
221 "For multiple inputs to the network, ensure input_data passed in is "
TypeError: Input type is not recognized. Please ensure input_data is valid.
For multiple inputs to the network, ensure input_data passed in is a sequence of tensors or a list of tuple sizes. If you are having trouble here, please submit a GitHub issue.
So, my question is: how can I find the proper input size and format to pass to the summary function? Or, more generally, how can I obtain the summary of such a model? (Not necessarily with torch-summary, but I need the same information that this library provides.)
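For what it's worth, I can already get parameter counts without torch-summary with something like the sketch below, but that only gives per-module parameter counts, not the per-layer output shapes and the rest of the table that torch-summary prints:
# Parameter counts only, no output shapes (no forward pass needed).
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total params: {total:,} (trainable: {trainable:,})")
for name, child in model.named_children():
    print(f"{name}: {sum(p.numel() for p in child.parameters()):,} params")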
Thanks in advance to anyone who can help me with this problem.
P.S.
I'm not sure whether it helps, but here is the output of print(model):
GroundingDINO(
(transformer): Transformer(
(encoder): TransformerEncoder(
(layers): ModuleList(
(0-5): 6 x DeformableTransformerEncoderLayer(
(self_attn): MultiScaleDeformableAttention(
(sampling_offsets): Linear(in_features=256, out_features=256, bias=True)
(attention_weights): Linear(in_features=256, out_features=128, bias=True)
(value_proj): Linear(in_features=256, out_features=256, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
)
(dropout1): Dropout(p=0.0, inplace=False)
(norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(linear1): Linear(in_features=256, out_features=2048, bias=True)
(dropout2): Dropout(p=0.0, inplace=False)
(linear2): Linear(in_features=2048, out_features=256, bias=True)
(dropout3): Dropout(p=0.0, inplace=False)
(norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(text_layers): ModuleList(
(0-5): 6 x TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(linear1): Linear(in_features=256, out_features=1024, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(linear2): Linear(in_features=1024, out_features=256, bias=True)
(norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.0, inplace=False)
(dropout2): Dropout(p=0.0, inplace=False)
)
)
(fusion_layers): ModuleList(
(0-5): 6 x BiAttentionBlock(
(layer_norm_v): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(layer_norm_l): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(attn): BiMultiHeadAttention(
(v_proj): Linear(in_features=256, out_features=1024, bias=True)
(l_proj): Linear(in_features=256, out_features=1024, bias=True)
(values_v_proj): Linear(in_features=256, out_features=1024, bias=True)
(values_l_proj): Linear(in_features=256, out_features=1024, bias=True)
(out_v_proj): Linear(in_features=1024, out_features=256, bias=True)
(out_l_proj): Linear(in_features=1024, out_features=256, bias=True)
)
(drop_path): DropPath(drop_prob=0.100)
)
)
)
(decoder): TransformerDecoder(
(layers): ModuleList(
(0-5): 6 x DeformableTransformerDecoderLayer(
(cross_attn): MultiScaleDeformableAttention(
(sampling_offsets): Linear(in_features=256, out_features=256, bias=True)
(attention_weights): Linear(in_features=256, out_features=128, bias=True)
(value_proj): Linear(in_features=256, out_features=256, bias=True)
(output_proj): Linear(in_features=256, out_features=256, bias=True)
)
(dropout1): Identity()
(norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(ca_text): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(catext_dropout): Identity()
(catext_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
)
(dropout2): Identity()
(norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(linear1): Linear(in_features=256, out_features=2048, bias=True)
(dropout3): Identity()
(linear2): Linear(in_features=2048, out_features=256, bias=True)
(dropout4): Identity()
(norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
)
)
(norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(ref_point_head): MLP(
(layers): ModuleList(
(0): Linear(in_features=512, out_features=256, bias=True)
(1): Linear(in_features=256, out_features=256, bias=True)
)
)
(bbox_embed): ModuleList(
(0-5): 6 x MLP(
(layers): ModuleList(
(0-1): 2 x Linear(in_features=256, out_features=256, bias=True)
(2): Linear(in_features=256, out_features=4, bias=True)
)
)
)
(class_embed): ModuleList(
(0-5): 6 x ContrastiveEmbed()
)
)
(tgt_embed): Embedding(900, 256)
(enc_output): Linear(in_features=256, out_features=256, bias=True)
(enc_output_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(enc_out_bbox_embed): MLP(
(layers): ModuleList(
(0-1): 2 x Linear(in_features=256, out_features=256, bias=True)
(2): Linear(in_features=256, out_features=4, bias=True)
)
)
(enc_out_class_embed): ContrastiveEmbed()
)
(bert): BertModelWarper(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0-11): 12 x BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
(feat_map): Linear(in_features=768, out_features=256, bias=True)
(input_proj): ModuleList(
(0): Sequential(
(0): Conv2d(192, 256, kernel_size=(1, 1), stride=(1, 1))
(1): GroupNorm(32, 256, eps=1e-05, affine=True)
)
(1): Sequential(
(0): Conv2d(384, 256, kernel_size=(1, 1), stride=(1, 1))
(1): GroupNorm(32, 256, eps=1e-05, affine=True)
)
(2): Sequential(
(0): Conv2d(768, 256, kernel_size=(1, 1), stride=(1, 1))
(1): GroupNorm(32, 256, eps=1e-05, affine=True)
)
(3): Sequential(
(0): Conv2d(768, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(1): GroupNorm(32, 256, eps=1e-05, affine=True)
)
)
(backbone): Joiner(
(0): SwinTransformer(
(patch_embed): PatchEmbed(
(proj): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
(norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
)
(pos_drop): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0): BasicLayer(
(blocks): ModuleList(
(0): SwinTransformerBlock(
(norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=96, out_features=288, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=96, out_features=96, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
(norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=96, out_features=384, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=384, out_features=96, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=96, out_features=288, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=96, out_features=96, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.018)
(norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=96, out_features=384, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=384, out_features=96, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=384, out_features=192, bias=False)
(norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
)
)
(1): BasicLayer(
(blocks): ModuleList(
(0): SwinTransformerBlock(
(norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=192, out_features=576, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=192, out_features=192, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.036)
(norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=192, out_features=768, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=768, out_features=192, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=192, out_features=576, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=192, out_features=192, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.055)
(norm2): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=192, out_features=768, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=768, out_features=192, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=768, out_features=384, bias=False)
(norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
(2): BasicLayer(
(blocks): ModuleList(
(0): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.073)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.091)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(2): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.109)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(3): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.127)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(4): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.145)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(5): SwinTransformerBlock(
(norm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=384, out_features=1152, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=384, out_features=384, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.164)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=384, out_features=1536, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=1536, out_features=384, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=1536, out_features=768, bias=False)
(norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
)
)
(3): BasicLayer(
(blocks): ModuleList(
(0): SwinTransformerBlock(
(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.182)
(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=768, out_features=2304, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=768, out_features=768, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): DropPath(drop_prob=0.200)
(norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Mlp(
(fc1): Linear(in_features=768, out_features=3072, bias=True)
(act): GELU(approximate='none')
(fc2): Linear(in_features=3072, out_features=768, bias=True)
(drop): Dropout(p=0.0, inplace=False)
)
)
)
)
)
(norm1): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): PositionEmbeddingSineHW()
)
(bbox_embed): ModuleList(
(0-5): 6 x MLP(
(layers): ModuleList(
(0-1): 2 x Linear(in_features=256, out_features=256, bias=True)
(2): Linear(in_features=256, out_features=4, bias=True)
)
)
)
(class_embed): ModuleList(
(0-5): 6 x ContrastiveEmbed()
)
)