% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/layers-attention.R
\name{layer_attention}
\alias{layer_attention}
\title{Dot-product attention layer, a.k.a. Luong-style attention.}
\usage{
layer_attention(
  object,
  use_scale = FALSE,
  score_mode = "dot",
  dropout = 0,
  seed = NULL,
  ...
)
}
\arguments{
\item{object}{Object to compose the layer with. A tensor, array, or sequential model.}

\item{use_scale}{If \code{TRUE}, will create a scalar variable to scale the
attention scores.}

\item{score_mode}{Function to use to compute attention scores, one of
\verb{\{"dot", "concat"\}}. \code{"dot"} refers to the dot product between the
query and key vectors. \code{"concat"} refers to the hyperbolic tangent
of the concatenation of the \code{query} and \code{key} vectors.}

\item{dropout}{Float between 0 and 1. Fraction of the units to drop for the
attention scores. Defaults to \code{0.0}.}

\item{seed}{An integer to use as the random seed in case of \code{dropout}.}

\item{...}{For forward/backward compatibility.}
}
\value{
The return value depends on the value provided for the first argument.
If \code{object} is:
\itemize{
\item a \code{keras_model_sequential()}, then the layer is added to the sequential model
(which is modified in place). To enable piping, the sequential model is also
returned, invisibly.
\item a \code{keras_input()}, then the output tensor from calling \code{layer(input)} is returned.
\item \code{NULL} or missing, then a \code{Layer} instance is returned.
}
}
\description{
Inputs are a list with 2 or 3 elements:
\enumerate{
\item A \code{query} tensor of shape \verb{(batch_size, Tq, dim)}.
\item A \code{value} tensor of shape \verb{(batch_size, Tv, dim)}.
\item An optional \code{key} tensor of shape \verb{(batch_size, Tv, dim)}. If none
is supplied, \code{value} will be used as the \code{key}.
}

The calculation follows the steps:
\enumerate{
\item Calculate attention scores using \code{query} and \code{key} with shape
\verb{(batch_size, Tq, Tv)}.
\item Use scores to calculate a softmax distribution with shape
\verb{(batch_size, Tq, Tv)}.
\item Use the softmax distribution to create a linear combination of \code{value}
with shape \verb{(batch_size, Tq, dim)}.
}
}
\section{Call Arguments}{
\itemize{
\item \code{inputs}: List of the following tensors:
\itemize{
\item \code{query}: Query tensor of shape \verb{(batch_size, Tq, dim)}.
\item \code{value}: Value tensor of shape \verb{(batch_size, Tv, dim)}.
\item \code{key}: Optional key tensor of shape \verb{(batch_size, Tv, dim)}. If
not given, will use \code{value} for both \code{key} and \code{value}, which is
the most common case.
}
\item \code{mask}: List of the following tensors:
\itemize{
\item \code{query_mask}: A boolean mask tensor of shape \verb{(batch_size, Tq)}.
If given, the output will be zero at the positions where
\code{mask==FALSE}.
\item \code{value_mask}: A boolean mask tensor of shape \verb{(batch_size, Tv)}.
If given, will apply the mask such that values at positions
where \code{mask==FALSE} do not contribute to the result.
}
\item \code{return_attention_scores}: Boolean. If \code{TRUE}, returns the attention scores
(after masking and softmax) as an additional output.
\item \code{training}: Boolean indicating whether the layer should behave in
training mode (adding dropout) or in inference mode (no dropout).
\item \code{use_causal_mask}: Boolean. Set to \code{TRUE} for decoder self-attention. Adds
a mask such that position \code{i} cannot attend to positions \code{j > i}.
This prevents the flow of information from the future towards the
past. Defaults to \code{FALSE}.
}
}

\section{Output}{
Attention outputs of shape \verb{(batch_size, Tq, dim)}.
(Optional) Attention scores after masking and softmax with shape
\verb{(batch_size, Tq, Tv)}.
}

\seealso{
\itemize{
\item \url{https://keras.io/api/layers/attention_layers/attention#attention-class}
}

Other attention layers: \cr
\code{\link{layer_additive_attention}()} \cr
\code{\link{layer_group_query_attention}()} \cr
\code{\link{layer_multi_head_attention}()} \cr

Other layers: \cr
\code{\link{Layer}()} \cr
\code{\link{layer_activation}()} \cr
\code{\link{layer_activation_elu}()} \cr
\code{\link{layer_activation_leaky_relu}()} \cr
\code{\link{layer_activation_parametric_relu}()} \cr
\code{\link{layer_activation_relu}()} \cr
\code{\link{layer_activation_softmax}()} \cr
\code{\link{layer_activity_regularization}()} \cr
\code{\link{layer_add}()} \cr
\code{\link{layer_additive_attention}()} \cr
\code{\link{layer_alpha_dropout}()} \cr
\code{\link{layer_average}()} \cr
\code{\link{layer_average_pooling_1d}()} \cr
\code{\link{layer_average_pooling_2d}()} \cr
\code{\link{layer_average_pooling_3d}()} \cr
\code{\link{layer_batch_normalization}()} \cr
\code{\link{layer_bidirectional}()} \cr
\code{\link{layer_category_encoding}()} \cr
\code{\link{layer_center_crop}()} \cr
\code{\link{layer_concatenate}()} \cr
\code{\link{layer_conv_1d}()} \cr
\code{\link{layer_conv_1d_transpose}()} \cr
\code{\link{layer_conv_2d}()} \cr
\code{\link{layer_conv_2d_transpose}()} \cr
\code{\link{layer_conv_3d}()} \cr
\code{\link{layer_conv_3d_transpose}()} \cr
\code{\link{layer_conv_lstm_1d}()} \cr
\code{\link{layer_conv_lstm_2d}()} \cr
\code{\link{layer_conv_lstm_3d}()} \cr
\code{\link{layer_cropping_1d}()} \cr
\code{\link{layer_cropping_2d}()} \cr
\code{\link{layer_cropping_3d}()} \cr
\code{\link{layer_dense}()} \cr
\code{\link{layer_depthwise_conv_1d}()} \cr
\code{\link{layer_depthwise_conv_2d}()} \cr
\code{\link{layer_discretization}()} \cr
\code{\link{layer_dot}()} \cr
\code{\link{layer_dropout}()} \cr
\code{\link{layer_einsum_dense}()} \cr
\code{\link{layer_embedding}()} \cr
\code{\link{layer_feature_space}()} \cr
\code{\link{layer_flatten}()} \cr
\code{\link{layer_flax_module_wrapper}()} \cr
\code{\link{layer_gaussian_dropout}()} \cr
\code{\link{layer_gaussian_noise}()} \cr
\code{\link{layer_global_average_pooling_1d}()} \cr
\code{\link{layer_global_average_pooling_2d}()} \cr
\code{\link{layer_global_average_pooling_3d}()} \cr
\code{\link{layer_global_max_pooling_1d}()} \cr
\code{\link{layer_global_max_pooling_2d}()} \cr
\code{\link{layer_global_max_pooling_3d}()} \cr
\code{\link{layer_group_normalization}()} \cr
\code{\link{layer_group_query_attention}()} \cr
\code{\link{layer_gru}()} \cr
\code{\link{layer_hashed_crossing}()} \cr
\code{\link{layer_hashing}()} \cr
\code{\link{layer_identity}()} \cr
\code{\link{layer_integer_lookup}()} \cr
\code{\link{layer_jax_model_wrapper}()} \cr
\code{\link{layer_lambda}()} \cr
\code{\link{layer_layer_normalization}()} \cr
\code{\link{layer_lstm}()} \cr
\code{\link{layer_masking}()} \cr
\code{\link{layer_max_pooling_1d}()} \cr
\code{\link{layer_max_pooling_2d}()} \cr
\code{\link{layer_max_pooling_3d}()} \cr
\code{\link{layer_maximum}()} \cr
\code{\link{layer_mel_spectrogram}()} \cr
\code{\link{layer_minimum}()} \cr
\code{\link{layer_multi_head_attention}()} \cr
\code{\link{layer_multiply}()} \cr
\code{\link{layer_normalization}()} \cr
\code{\link{layer_permute}()} \cr
\code{\link{layer_random_brightness}()} \cr
\code{\link{layer_random_contrast}()} \cr
\code{\link{layer_random_crop}()} \cr
\code{\link{layer_random_flip}()} \cr
\code{\link{layer_random_rotation}()} \cr
\code{\link{layer_random_translation}()} \cr
\code{\link{layer_random_zoom}()} \cr
\code{\link{layer_repeat_vector}()} \cr
\code{\link{layer_rescaling}()} \cr
\code{\link{layer_reshape}()} \cr
\code{\link{layer_resizing}()} \cr
\code{\link{layer_rnn}()} \cr
\code{\link{layer_separable_conv_1d}()} \cr
\code{\link{layer_separable_conv_2d}()} \cr
\code{\link{layer_simple_rnn}()} \cr
\code{\link{layer_spatial_dropout_1d}()} \cr
\code{\link{layer_spatial_dropout_2d}()} \cr
\code{\link{layer_spatial_dropout_3d}()} \cr
\code{\link{layer_spectral_normalization}()} \cr
\code{\link{layer_string_lookup}()} \cr
\code{\link{layer_subtract}()} \cr
\code{\link{layer_text_vectorization}()} \cr
\code{\link{layer_tfsm}()} \cr
\code{\link{layer_time_distributed}()} \cr
\code{\link{layer_torch_module_wrapper}()} \cr
\code{\link{layer_unit_normalization}()} \cr
\code{\link{layer_upsampling_1d}()} \cr
\code{\link{layer_upsampling_2d}()} \cr
\code{\link{layer_upsampling_3d}()} \cr
\code{\link{layer_zero_padding_1d}()} \cr
\code{\link{layer_zero_padding_2d}()} \cr
\code{\link{layer_zero_padding_3d}()} \cr
\code{\link{rnn_cell_gru}()} \cr
\code{\link{rnn_cell_lstm}()} \cr
\code{\link{rnn_cell_simple}()} \cr
\code{\link{rnn_cells_stack}()} \cr
}
\concept{attention layers}
\concept{layers}
