from typing import Optional

import torch
from torch import nn, Tensor
from torch.utils.checkpoint import checkpoint

from module.attention import MultiheadAttentionRelative
from utilities.misc import get_clones

# global index of the attention layer currently executing, reset to 0 after
# each full forward pass
layer_idx = 0


class Transformer(nn.Module):
    """
    Transformer computes self (intra image) and cross (inter image) attention
    """

    def __init__(self, hidden_dim: int = 128, nhead: int = 8, num_attn_layers: int = 6):
        super().__init__()

        # identical (deep-copied) layers for self and cross attention
        self_attn_layer = TransformerSelfAttnLayer(hidden_dim, nhead)
        self.self_attn_layers = get_clones(self_attn_layer, num_attn_layers)

        cross_attn_layer = TransformerCrossAttnLayer(hidden_dim, nhead)
        self.cross_attn_layers = get_clones(cross_attn_layer, num_attn_layers)

        self.norm = nn.LayerNorm(hidden_dim)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_attn_layers = num_attn_layers
zTransformer.__init__featpos_encpos_indexeshnc                 C   s   t t| j| jD ]A\}\}}|add }t|||||}|| jd kr*dd }	ndd }	t|	||ddd|f |dd|df ||\}}
q	da|
S )	az  
        Alternate self and cross attention with gradient checkpointing to save memory

        :param feat: image feature concatenated from left and right, [W,2HN,C]
        :param pos_enc: positional encoding, [W,HN,C]
        :param pos_indexes: indexes to slice positional encoding, [W,HN,C]
        :param hn: size of HN
        :return: attention weight [N,H,W,W]
        c                        fdd}|S )Nc                     s    |  S r   r   inputsmoduler   r   custom_self_attn7   s   zXTransformer._alternating_attn.<locals>.create_custom_self_attn.<locals>.custom_self_attnr   )r%   r&   r   r$   r   create_custom_self_attn6      z>Transformer._alternating_attn.<locals>.create_custom_self_attn   c                    r!   )Nc                         g | dR  S )NTr   r"   r$   r   r   custom_cross_attnB      ZTransformer._alternating_attn.<locals>.create_custom_cross_attn.<locals>.custom_cross_attnr   r%   r+   r   r$   r   create_custom_cross_attnA   r(   z?Transformer._alternating_attn.<locals>.create_custom_cross_attnc                    r!   )Nc                     r*   )NFr   r"   r$   r   r   r+   I   r,   r-   r   r.   r   r$   r   r/   H   r(   Nr   )	enumeratezipr   r   	layer_idxr   r   )r   r   r   r   r    idx	self_attn
cross_attnr'   r/   attn_weightr   r   r   _alternating_attn%   s   
.
zTransformer._alternating_attnN	feat_left

    def forward(self, feat_left: torch.Tensor, feat_right: torch.Tensor, pos_enc: Optional[Tensor] = None):
        """
        :param feat_left: feature descriptor of left image, [N,C,H,W]
        :param feat_right: feature descriptor of right image, [N,C,H,W]
        :param pos_enc: relative positional encoding, [N,C,H,2W-1]
        :return: cross attention values [N,H,W,W], dim=2 is left image, dim=3 is right image
        """
        # flatten NxCxHxW to WxHNxC
        bs, c, hn, w = feat_left.shape

        feat_left = feat_left.permute(1, 3, 2, 0).flatten(2).permute(1, 2, 0)  # CxWxHxN -> CxWxHN -> WxHNxC
        feat_right = feat_right.permute(1, 3, 2, 0).flatten(2).permute(1, 2, 0)

        if pos_enc is not None:
            with torch.no_grad():
                # indexes to shift relative positional encoding: entry (i,j) is (w-1)+(j-i)
                indexes_r = torch.linspace(w - 1, 0, w).view(w, 1).to(feat_left.device)
                indexes_c = torch.linspace(0, w - 1, w).view(1, w).to(feat_left.device)
                pos_indexes = (indexes_r + indexes_c).view(-1).long()  # WxW' -> WW'
        else:
            pos_indexes = None

        # concatenate left and right features
        feat = torch.cat([feat_left, feat_right], dim=1)  # Wx2HNxC

        # compute attention
        attn_weight = self._alternating_attn(feat, pos_enc, pos_indexes, hn)
        attn_weight = attn_weight.view(hn, bs, w, w).permute(1, 0, 2, 3)  # NxHxWxW, dim=2 left image, dim=3 right image

        return attn_weight
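
# Commentary (not in the original source): worked example of the pos_indexes
# arithmetic in Transformer.forward for W = 3:
#
#   indexes_r + indexes_c = [[2],   + [[0, 1, 2]] = [[2, 3, 4],
#                            [1],                    [1, 2, 3],
#                            [0]]                    [0, 1, 2]]
#
# Entry (i, j) equals (W - 1) + (j - i), i.e. the offset j - i between a left
# pixel i and a right pixel j, shifted to index the [2W-1] relative encoding.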


class TransformerSelfAttnLayer(nn.Module):
    """
    Self attention layer
    """

    def __init__(self, hidden_dim: int, nhead: int):
        super().__init__()
        self.self_attn = MultiheadAttentionRelative(hidden_dim, nhead)

        self.norm1 = nn.LayerNorm(hidden_dim)

    def forward(self, feat: Tensor,
                pos: Optional[Tensor] = None,
                pos_indexes: Optional[Tensor] = None):
        """
        :param feat: image feature [W,2HN,C]
        :param pos: pos encoding [2W-1,HN,C]
        :param pos_indexes: indexes to slice pos encoding [W,W]
        :return: updated image feature
        """
        # pre-norm: normalize before attention, then add the residual
        feat2 = self.norm1(feat)

        feat2, attn_weight, _ = self.self_attn(query=feat2, key=feat2, value=feat2, pos_enc=pos,
                                               pos_indexes=pos_indexes)

        feat = feat + feat2

        return feat


class TransformerCrossAttnLayer(nn.Module):
    """
    Cross attention layer
    """

    def __init__(self, hidden_dim: int, nhead: int):
        super().__init__()
        self.cross_attn = MultiheadAttentionRelative(hidden_dim, nhead)

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)

    def forward(self, feat_left: Tensor, feat_right: Tensor,
                pos: Optional[Tensor] = None,
                pos_indexes: Optional[Tensor] = None,
                last_layer: Optional[bool] = False):
        """
        :param feat_left: left image feature, [W,HN,C]
        :param feat_right: right image feature, [W,HN,C]
        :param pos: pos encoding, [2W-1,HN,C]
        :param pos_indexes: indexes to slice pos encoding [W,W]
        :param last_layer: Boolean indicating if the current layer is the last layer
        :return: updated image feature and attention weight
        """
        feat_left_2 = self.norm1(feat_left)
        feat_right_2 = self.norm1(feat_right)

        # update right features with left as key/value; the relative encoding
        # is flipped because the offset from right to left is negated
        if pos is not None:
            pos_flipped = torch.flip(pos, [0])
        else:
            pos_flipped = pos
        feat_right_2 = self.cross_attn(query=feat_right_2, key=feat_left_2, value=feat_left_2, pos_enc=pos_flipped,
                                       pos_indexes=pos_indexes)[0]

        feat_right = feat_right + feat_right_2

        # use attn mask for last layer
        if last_layer:
            w = feat_left_2.size(0)
            attn_mask = self._generate_square_subsequent_mask(w).to(feat_left.device)
        else:
            attn_mask = None

        # update left features with the updated (and re-normalized) right features
        feat_right_2 = self.norm2(feat_right)
        feat_left_2, attn_weight, raw_attn = self.cross_attn(query=feat_left_2, key=feat_right_2, value=feat_right_2,
                                                             attn_mask=attn_mask, pos_enc=pos,
                                                             pos_indexes=pos_indexes)

        feat_left = feat_left + feat_left_2

        # concatenate left and right features back into one tensor
        feat = torch.cat([feat_left, feat_right], dim=1)  # Wx2HNxC

        return feat, raw_attn

    @torch.no_grad()
    def _generate_square_subsequent_mask(self, sz: int):
        """
        Generate a mask which is upper triangular

        :param sz: square matrix size
        :return: [sz,sz] mask, 0 on and below the diagonal, -inf above it
        """
        mask = torch.triu(torch.ones(sz, sz), diagonal=1)
        mask[mask == 1] = float('-inf')
        return mask
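
# Commentary (not in the original source): for sz = 3 the mask above is
#
#   [[0., -inf, -inf],
#    [0.,   0., -inf],
#    [0.,   0.,   0.]]
#
# Applied with query = left pixel i and key = right pixel j, the -inf entries
# block matches with j > i, which encodes the non-negative disparity
# constraint on the last cross attention layer.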


def build_transformer(args):
    return Transformer(
        hidden_dim=args.channel_dim,
        nhead=args.nheads,
        num_attn_layers=args.num_attn_layers
    )
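

# ---------------------------------------------------------------------------
# Minimal smoke test (not in the original source). Assumes the repo root is on
# PYTHONPATH so that module.attention and utilities.misc resolve, and that
# MultiheadAttentionRelative accepts pos_enc=None; shapes follow the
# docstrings above, with a tiny spatial size to keep the WxW attention cheap.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    n, c, h, w = 1, 128, 4, 8
    model = Transformer(hidden_dim=c, nhead=8, num_attn_layers=2)
    feat_left = torch.rand(n, c, h, w)   # [N,C,H,W]
    feat_right = torch.rand(n, c, h, w)  # [N,C,H,W]
    with torch.no_grad():
        attn = model(feat_left, feat_right)  # pos_enc=None skips the relative encoding
    print(attn.shape)  # expected [N,H,W,W]: torch.Size([1, 4, 8, 8])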