From bbba4f66baa467431c14c138d40b4bde8361a59d Mon Sep 17 00:00:00 2001
From: sahahner
Date: Mon, 2 Dec 2024 15:29:53 +0000
Subject: [PATCH 1/7] positional-embedding-hidden-grid

---
 src/anemoi/models/layers/block.py                     | 11 +++++++++++
 src/anemoi/models/layers/chunk.py                     |  2 ++
 src/anemoi/models/layers/processor.py                 |  5 +++++
 src/anemoi/models/models/encoder_processor_decoder.py | 10 ++++++++++
 4 files changed, 28 insertions(+)

diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index 60446d6c..f232a042 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -68,6 +68,7 @@ def __init__(
         num_heads: int,
         activation: str,
         window_size: int,
+        grid_lat_coslon_sinlon: Tensor = None,
         dropout_p: float = 0.0,
     ):
         super().__init__()
@@ -80,6 +81,11 @@ def __init__(
 
         self.layer_norm1 = nn.LayerNorm(num_channels)
 
+        self.grid_lat_coslon_sinlon = grid_lat_coslon_sinlon
+        if self.grid_lat_coslon_sinlon is not None:
+            self.grid_lat_coslon_sinlon = self.grid_lat_coslon_sinlon
+            self.pos_embedder = nn.Linear(3, num_channels) # assuming that we have 3 position features, lat and cos / sin of lon
+
         self.attention = MultiHeadSelfAttention(
             num_heads=num_heads,
             embed_dim=num_channels,
@@ -99,6 +105,11 @@ def __init__(
     def forward(
         self, x: Tensor, shapes: list, batch_size: int, model_comm_group: Optional[ProcessGroup] = None
     ) -> Tensor:
+        if self.grid_lat_coslon_sinlon is not None:
+            pos_embedding = self.pos_embedder(self.grid_lat_coslon_sinlon.to(x.device))
+            pos_embedding = pos_embedding.repeat(batch_size, 1)
+            x = x + pos_embedding
+
         # Need to be out of place for gradient propagation
         x = x + self.attention(self.layer_norm1(x), shapes, batch_size, model_comm_group=model_comm_group)
         x = x + self.mlp(self.layer_norm2(x))
diff --git a/src/anemoi/models/layers/chunk.py b/src/anemoi/models/layers/chunk.py
index 5c4fae38..6a33ae9b 100644
--- a/src/anemoi/models/layers/chunk.py
+++ b/src/anemoi/models/layers/chunk.py
@@ -74,6 +74,7 @@ def __init__(
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         activation: str = "GELU",
+        grid_lat_coslon_sinlon: Tensor = None,
         dropout_p: float = 0.0,
     ) -> None:
         """Initialize TransformerProcessor.
@@ -102,6 +103,7 @@ def __init__(
             num_heads=num_heads,
             activation=activation,
             window_size=window_size,
+            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
             dropout_p=dropout_p,
         )
 
diff --git a/src/anemoi/models/layers/processor.py b/src/anemoi/models/layers/processor.py
index 4fd32311..1573bc76 100644
--- a/src/anemoi/models/layers/processor.py
+++ b/src/anemoi/models/layers/processor.py
@@ -40,6 +40,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
+        grid_lat_coslon_sinlon: Tensor = None,
         **kwargs,
     ) -> None:
         """Initialize BaseProcessor."""
@@ -49,6 +50,7 @@ def __init__(
         self.num_chunks = num_chunks
         self.num_channels = num_channels
         self.chunk_size = num_layers // num_chunks
+        self.grid_lat_coslon_sinlon = grid_lat_coslon_sinlon
 
         assert (
             num_layers % num_chunks == 0
@@ -94,6 +96,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
+        grid_lat_coslon_sinlon: Tensor = None,
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         dropout_p: float = 0.1,
@@ -125,6 +128,7 @@ def __init__(
             num_chunks=num_chunks,
             activation=activation,
             cpu_offload=cpu_offload,
+            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
             num_heads=num_heads,
             mlp_hidden_ratio=mlp_hidden_ratio,
         )
@@ -137,6 +141,7 @@ def __init__(
             num_layers=self.chunk_size,
             window_size=window_size,
             activation=activation,
+            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
             dropout_p=dropout_p,
         )
 
diff --git a/src/anemoi/models/models/encoder_processor_decoder.py b/src/anemoi/models/models/encoder_processor_decoder.py
index c67c8c03..ab5d3753 100644
--- a/src/anemoi/models/models/encoder_processor_decoder.py
+++ b/src/anemoi/models/models/encoder_processor_decoder.py
@@ -76,11 +76,21 @@ def __init__(
             dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )
 
+        latlons_hidden = self.node_attributes.get_coordinates(self._graph_name_hidden)
+        lat_coslon_sinlon_hidden = torch.cat( # lat, cos(lon), sin(lon) for hidden grid points
+            ( latlons_hidden[:, 0].unsqueeze(-1),
+                torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),
+                torch.sin(latlons_hidden[:, 1].unsqueeze(-1)),
+            ),
+            dim=-1,
+        )
+
         # Processor hidden -> hidden
         self.processor = instantiate(
             model_config.model.processor,
             num_channels=self.num_channels,
             sub_graph=self._graph_data[(self._graph_name_hidden, "to", self._graph_name_hidden)],
+            grid_lat_coslon_sinlon = lat_coslon_sinlon_hidden,
             src_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
             dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )

From 89e88b45665bbb99dec69c34fd4146dd62f5121a Mon Sep 17 00:00:00 2001
From: sahahner
Date: Thu, 5 Dec 2024 13:31:39 +0000
Subject: [PATCH 2/7] duplicate line and positions as buffer

---
 src/anemoi/models/layers/block.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index f232a042..1b65b269 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -81,9 +81,8 @@ def __init__(
 
         self.layer_norm1 = nn.LayerNorm(num_channels)
 
-        self.grid_lat_coslon_sinlon = grid_lat_coslon_sinlon
+        self.register_buffer("grid_lat_coslon_sinlon", grid_lat_coslon_sinlon)
         if self.grid_lat_coslon_sinlon is not None:
-            self.grid_lat_coslon_sinlon = self.grid_lat_coslon_sinlon
             self.pos_embedder = nn.Linear(3, num_channels) # assuming that we have 3 position features, lat and cos / sin of lon
 
         self.attention = MultiHeadSelfAttention(
@@ -106,7 +105,7 @@ def forward(
         self, x: Tensor, shapes: list, batch_size: int, model_comm_group: Optional[ProcessGroup] = None
     ) -> Tensor:
         if self.grid_lat_coslon_sinlon is not None:
-            pos_embedding = self.pos_embedder(self.grid_lat_coslon_sinlon.to(x.device))
+            pos_embedding = self.pos_embedder(self.grid_lat_coslon_sinlon)
             pos_embedding = pos_embedding.repeat(batch_size, 1)
             x = x + pos_embedding

From 0385744224406ebc360956a755b1d281ae5a1e4f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 5 Dec 2024 13:38:10 +0000
Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/anemoi/models/layers/block.py                     | 6 ++++--
 src/anemoi/models/models/encoder_processor_decoder.py | 7 ++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index 1b65b269..e8a60964 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -83,7 +83,9 @@ def __init__(
 
         self.register_buffer("grid_lat_coslon_sinlon", grid_lat_coslon_sinlon)
         if self.grid_lat_coslon_sinlon is not None:
-            self.pos_embedder = nn.Linear(3, num_channels) # assuming that we have 3 position features, lat and cos / sin of lon
+            self.pos_embedder = nn.Linear(
+                3, num_channels
+            )  # assuming that we have 3 position features, lat and cos / sin of lon
 
         self.attention = MultiHeadSelfAttention(
             num_heads=num_heads,
@@ -108,7 +110,7 @@ def forward(
             pos_embedding = self.pos_embedder(self.grid_lat_coslon_sinlon)
             pos_embedding = pos_embedding.repeat(batch_size, 1)
             x = x + pos_embedding
-
+
         # Need to be out of place for gradient propagation
         x = x + self.attention(self.layer_norm1(x), shapes, batch_size, model_comm_group=model_comm_group)
         x = x + self.mlp(self.layer_norm2(x))
diff --git a/src/anemoi/models/models/encoder_processor_decoder.py b/src/anemoi/models/models/encoder_processor_decoder.py
index ab5d3753..33ce347b 100644
--- a/src/anemoi/models/models/encoder_processor_decoder.py
+++ b/src/anemoi/models/models/encoder_processor_decoder.py
@@ -77,8 +77,9 @@ def __init__(
         )
 
         latlons_hidden = self.node_attributes.get_coordinates(self._graph_name_hidden)
-        lat_coslon_sinlon_hidden = torch.cat( # lat, cos(lon), sin(lon) for hidden grid points
-            ( latlons_hidden[:, 0].unsqueeze(-1),
+        lat_coslon_sinlon_hidden = torch.cat(  # lat, cos(lon), sin(lon) for hidden grid points
+            (
+                latlons_hidden[:, 0].unsqueeze(-1),
                 torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),
                 torch.sin(latlons_hidden[:, 1].unsqueeze(-1)),
             ),
             dim=-1,
@@ -90,7 +91,7 @@ def __init__(
             model_config.model.processor,
             num_channels=self.num_channels,
             sub_graph=self._graph_data[(self._graph_name_hidden, "to", self._graph_name_hidden)],
-            grid_lat_coslon_sinlon = lat_coslon_sinlon_hidden,
+            grid_lat_coslon_sinlon=lat_coslon_sinlon_hidden,
             src_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
             dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )

From df258416c87f88a7cf7d90076c241ea26a6a3dc3 Mon Sep 17 00:00:00 2001
From: sahahner
Date: Mon, 16 Dec 2024 14:15:00 +0000
Subject: [PATCH 4/7] resolve comments

---
 src/anemoi/models/layers/block.py     | 4 ++--
 src/anemoi/models/layers/chunk.py     | 2 +-
 src/anemoi/models/layers/processor.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index f1dc1f33..15e7b44f 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -68,7 +68,7 @@ def __init__(
         num_heads: int,
         activation: str,
         window_size: int,
-        grid_lat_coslon_sinlon: Tensor = None,
+        grid_lat_coslon_sinlon: Optional[Tensor] = None,
         dropout_p: float = 0.0,
     ):
         super().__init__()
@@ -84,7 +84,7 @@ def __init__(
         self.register_buffer("grid_lat_coslon_sinlon", grid_lat_coslon_sinlon)
         if self.grid_lat_coslon_sinlon is not None:
             self.pos_embedder = nn.Linear(
-                3, num_channels
+                self.grid_lat_coslon_sinlon.shape[-1], num_channels
             )  # assuming that we have 3 position features, lat and cos / sin of lon
 
         self.attention = MultiHeadSelfAttention(
diff --git a/src/anemoi/models/layers/chunk.py b/src/anemoi/models/layers/chunk.py
index 6a33ae9b..68e9f9f8 100644
--- a/src/anemoi/models/layers/chunk.py
+++ b/src/anemoi/models/layers/chunk.py
@@ -74,7 +74,7 @@ def __init__(
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         activation: str = "GELU",
-        grid_lat_coslon_sinlon: Tensor = None,
+        grid_lat_coslon_sinlon: Optional[Tensor] = None,
         dropout_p: float = 0.0,
     ) -> None:
         """Initialize TransformerProcessor.
diff --git a/src/anemoi/models/layers/processor.py b/src/anemoi/models/layers/processor.py
index 76d526f2..53b07fbc 100644
--- a/src/anemoi/models/layers/processor.py
+++ b/src/anemoi/models/layers/processor.py
@@ -40,7 +40,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
-        grid_lat_coslon_sinlon: Tensor = None,
+        grid_lat_coslon_sinlon: Optional[Tensor] = None,
         **kwargs,
     ) -> None:
         """Initialize BaseProcessor."""
@@ -96,7 +96,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
-        grid_lat_coslon_sinlon: Tensor = None,
+        grid_lat_coslon_sinlon: Optional[Tensor] = None,
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         dropout_p: float = 0.1,

From 8b8b9ebacf3da2ff72aba8eb3a725ce4fbe29f8d Mon Sep 17 00:00:00 2001
From: sahahner
Date: Mon, 16 Dec 2024 15:31:11 +0000
Subject: [PATCH 5/7] hydra instantiation for positional encoding implementing
 different functions

---
 src/anemoi/models/layers/block.py        | 12 ++--
 src/anemoi/models/layers/chunk.py        |  4 +-
 .../models/layers/positionalencoding.py  | 59 +++++++++++++++++++
 src/anemoi/models/layers/processor.py    | 10 ++--
 .../models/encoder_processor_decoder.py  | 20 +++----
 5 files changed, 82 insertions(+), 23 deletions(-)
 create mode 100644 src/anemoi/models/layers/positionalencoding.py

diff --git a/src/anemoi/models/layers/block.py b/src/anemoi/models/layers/block.py
index 15e7b44f..6b04199d 100644
--- a/src/anemoi/models/layers/block.py
+++ b/src/anemoi/models/layers/block.py
@@ -68,7 +68,7 @@ def __init__(
         num_heads: int,
         activation: str,
         window_size: int,
-        grid_lat_coslon_sinlon: Optional[Tensor] = None,
+        positional_encoding_hidden: Optional[Tensor] = None,
         dropout_p: float = 0.0,
     ):
         super().__init__()
@@ -81,10 +81,10 @@ def __init__(
 
         self.layer_norm1 = nn.LayerNorm(num_channels)
 
-        self.register_buffer("grid_lat_coslon_sinlon", grid_lat_coslon_sinlon)
-        if self.grid_lat_coslon_sinlon is not None:
+        self.register_buffer("positional_encoding_hidden", positional_encoding_hidden)
+        if self.positional_encoding_hidden is not None:
             self.pos_embedder = nn.Linear(
-                self.grid_lat_coslon_sinlon.shape[-1], num_channels
+                self.positional_encoding_hidden.shape[-1], num_channels
             )  # assuming that we have 3 position features, lat and cos / sin of lon
 
         self.attention = MultiHeadSelfAttention(
@@ -106,8 +106,8 @@ def __init__(
 
     def forward(
         self, x: Tensor, shapes: list, batch_size: int, model_comm_group: Optional[ProcessGroup] = None
     ) -> Tensor:
-        if self.grid_lat_coslon_sinlon is not None:
-            pos_embedding = self.pos_embedder(self.grid_lat_coslon_sinlon)
+        if self.positional_encoding_hidden is not None:
+            pos_embedding = self.pos_embedder(self.positional_encoding_hidden)
             pos_embedding = pos_embedding.repeat(batch_size, 1)
             x = x + pos_embedding
diff --git a/src/anemoi/models/layers/chunk.py b/src/anemoi/models/layers/chunk.py
index 68e9f9f8..67e1481c 100644
--- a/src/anemoi/models/layers/chunk.py
+++ b/src/anemoi/models/layers/chunk.py
@@ -74,7 +74,7 @@ def __init__(
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         activation: str = "GELU",
-        grid_lat_coslon_sinlon: Optional[Tensor] = None,
+        positional_encoding_hidden: Optional[Tensor] = None,
         dropout_p: float = 0.0,
     ) -> None:
         """Initialize TransformerProcessor.
@@ -103,7 +103,7 @@ def __init__(
             num_heads=num_heads,
             activation=activation,
             window_size=window_size,
-            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
+            positional_encoding_hidden=positional_encoding_hidden,
             dropout_p=dropout_p,
         )
 
diff --git a/src/anemoi/models/layers/positionalencoding.py b/src/anemoi/models/layers/positionalencoding.py
new file mode 100644
index 00000000..867603c1
--- /dev/null
+++ b/src/anemoi/models/layers/positionalencoding.py
@@ -0,0 +1,59 @@
+from abc import ABC
+from abc import abstractmethod
+
+import torch
+from torch import Tensor
+
+
+class BasePositionalEncoding(ABC):
+    """Configurable method calculating positional encodings for latlons of a grid.
+
+    To enable the positional encoding add the following to the model-config file and
+    choose the corresponding positional-encoding-class:
+    ```
+    positional_encoding:
+      _target_: anemoi.models.layers.positionalencoding.CosSinLatCosSinLon
+      _convert_: all
+    ```
+    If the entry positional_encoding does not exist or is None, no positional encoding is used.
+
+    """
+
+    def __init__(self) -> None:
+        """Initialise function for calculating the positional encodings."""
+
+    @abstractmethod
+    def positional_encoding(self, latlons_hidden: Tensor) -> Tensor: ...
+
+
+class LatCosSinLon(BasePositionalEncoding):
+    """Lat, cos(lon), sin(lon) for grid points."""
+
+    def positional_encoding(self, latlons_hidden: Tensor) -> Tensor:
+        """Output lat, cos(lon), sin(lon) for grid points."""
+        lat_coslon_sinlon_hidden = torch.cat(  # lat, cos(lon), sin(lon) for hidden grid points
+            (
+                latlons_hidden[:, 0].unsqueeze(-1),
+                torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),
+                torch.sin(latlons_hidden[:, 1].unsqueeze(-1)),
+            ),
+            dim=-1,
+        )
+        return lat_coslon_sinlon_hidden
+
+
+class CosSinLatCosSinLon(BasePositionalEncoding):
+    """Cos(lat), sin(lat), cos(lon), sin(lon) for grid points."""
+
+    def positional_encoding(self, latlons_hidden: Tensor) -> Tensor:
+        """Output cos(lat), sin(lat), cos(lon), sin(lon) for grid points."""
+        coslat_sinlat_coslon_sinlon_hidden = torch.cat(
+            (
+                torch.cos(latlons_hidden[:, 0].unsqueeze(-1)),
+                torch.sin(latlons_hidden[:, 0].unsqueeze(-1)),
+                torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),
+                torch.sin(latlons_hidden[:, 1].unsqueeze(-1)),
+            ),
+            dim=-1,
+        )
+        return coslat_sinlat_coslon_sinlon_hidden
diff --git a/src/anemoi/models/layers/processor.py b/src/anemoi/models/layers/processor.py
index 53b07fbc..24aa3026 100644
--- a/src/anemoi/models/layers/processor.py
+++ b/src/anemoi/models/layers/processor.py
@@ -40,7 +40,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
-        grid_lat_coslon_sinlon: Optional[Tensor] = None,
+        positional_encoding_hidden: Optional[Tensor] = None,
         **kwargs,
     ) -> None:
         """Initialize BaseProcessor."""
@@ -50,7 +50,7 @@ def __init__(
         self.num_chunks = num_chunks
         self.num_channels = num_channels
         self.chunk_size = num_layers // num_chunks
-        self.grid_lat_coslon_sinlon = grid_lat_coslon_sinlon
+        self.positional_encoding_hidden = positional_encoding_hidden
 
         assert (
             num_layers % num_chunks == 0
@@ -96,7 +96,7 @@ def __init__(
         num_chunks: int = 2,
         activation: str = "GELU",
         cpu_offload: bool = False,
-        grid_lat_coslon_sinlon: Optional[Tensor] = None,
+        positional_encoding_hidden: Optional[Tensor] = None,
         num_heads: int = 16,
         mlp_hidden_ratio: int = 4,
         dropout_p: float = 0.1,
@@ -128,7 +128,7 @@ def __init__(
             num_chunks=num_chunks,
             activation=activation,
             cpu_offload=cpu_offload,
-            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
+            positional_encoding_hidden=positional_encoding_hidden,
             num_heads=num_heads,
             mlp_hidden_ratio=mlp_hidden_ratio,
         )
@@ -141,7 +141,7 @@ def __init__(
             num_layers=self.chunk_size,
             window_size=window_size,
             activation=activation,
-            grid_lat_coslon_sinlon=grid_lat_coslon_sinlon,
+            positional_encoding_hidden=positional_encoding_hidden,
             dropout_p=dropout_p,
         )
 
diff --git a/src/anemoi/models/models/encoder_processor_decoder.py b/src/anemoi/models/models/encoder_processor_decoder.py
index 33ce347b..4c07f1dd 100644
--- a/src/anemoi/models/models/encoder_processor_decoder.py
+++ b/src/anemoi/models/models/encoder_processor_decoder.py
@@ -76,22 +76,22 @@ def __init__(
             dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )
 
-        latlons_hidden = self.node_attributes.get_coordinates(self._graph_name_hidden)
-        lat_coslon_sinlon_hidden = torch.cat(  # lat, cos(lon), sin(lon) for hidden grid points
-            (
-                latlons_hidden[:, 0].unsqueeze(-1),
-                torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),
-                torch.sin(latlons_hidden[:, 1].unsqueeze(-1)),
-            ),
-            dim=-1,
-        )
+        positional_encoding_hidden = None
+        if model_config.model.get("positional_encoding") is not None:
+            LOGGER.info(
+                "Using positional encoding. Target function: %s", model_config.model.positional_encoding._target_
+            )
+            self.positional_encoding = instantiate(model_config.model.positional_encoding)
+            positional_encoding_hidden = self.positional_encoding.positional_encoding(
+                self.node_attributes.get_coordinates(self._graph_name_hidden)
+            )
 
         # Processor hidden -> hidden
         self.processor = instantiate(
             model_config.model.processor,
             num_channels=self.num_channels,
             sub_graph=self._graph_data[(self._graph_name_hidden, "to", self._graph_name_hidden)],
-            grid_lat_coslon_sinlon=lat_coslon_sinlon_hidden,
+            positional_encoding_hidden=positional_encoding_hidden,
             src_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
             dst_grid_size=self.node_attributes.num_nodes[self._graph_name_hidden],
         )

From 59a544574141f15e18b9f8a46568ace80e2a3525 Mon Sep 17 00:00:00 2001
From: sahahner
Date: Mon, 16 Dec 2024 15:31:40 +0000
Subject: [PATCH 6/7] hydra instantiation for positional encoding implementing
 different functions

---
 src/anemoi/models/layers/positionalencoding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anemoi/models/layers/positionalencoding.py b/src/anemoi/models/layers/positionalencoding.py
index 867603c1..06b454d8 100644
--- a/src/anemoi/models/layers/positionalencoding.py
+++ b/src/anemoi/models/layers/positionalencoding.py
@@ -31,7 +31,7 @@ class LatCosSinLon(BasePositionalEncoding):
 
     def positional_encoding(self, latlons_hidden: Tensor) -> Tensor:
         """Output lat, cos(lon), sin(lon) for grid points."""
-        lat_coslon_sinlon_hidden = torch.cat(  # lat, cos(lon), sin(lon) for hidden grid points
+        lat_coslon_sinlon_hidden = torch.cat(
             (
                 latlons_hidden[:, 0].unsqueeze(-1),
                 torch.cos(latlons_hidden[:, 1].unsqueeze(-1)),

From 2c78573cdaf83b0cc8f6326bcdb080f28272fa78 Mon Sep 17 00:00:00 2001
From: sahahner
Date: Fri, 3 Jan 2025 14:21:23 +0000
Subject: [PATCH 7/7] delete old comment

---
 models/src/anemoi/models/layers/block.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/models/src/anemoi/models/layers/block.py b/models/src/anemoi/models/layers/block.py
index 6b04199d..afcb776e 100644
--- a/models/src/anemoi/models/layers/block.py
+++ b/models/src/anemoi/models/layers/block.py
@@ -85,7 +85,7 @@ def __init__(
         if self.positional_encoding_hidden is not None:
             self.pos_embedder = nn.Linear(
                 self.positional_encoding_hidden.shape[-1], num_channels
-            )  # assuming that we have 3 position features, lat and cos / sin of lon
+            )  # hidden_dim is num_channels
 
         self.attention = MultiHeadSelfAttention(
             num_heads=num_heads,