import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import List, Any, Dict, Optional, Union, Tuple, Iterator
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from torch._decomp import register_decomposition
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten


def get_shape(i):
    # Tensors are reduced to their shapes; everything else passes through.
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: Dict[Any, Any] = {}


def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False):
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)
        register_decomposition(targets, registry=flop_registry, unsafe=True)(flop_formula)
        return flop_formula

    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contains the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB: Should be 2 * k - 1 technically for FLOPs.
    return m * n * 2 * k
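
# Sanity check for the matmul formula (an illustrative sketch, not part of the
# original module): an (m, k) @ (k, n) matmul performs m * n dot products of
# length k, i.e. roughly 2 * m * k * n FLOPs. Because ``register_flop_formula``
# wraps the formula with ``shape_wrapper``, it can be called directly with
# plain shape tuples:
#
#     >>> mm_flop((16, 32), (32, 8))  # 2 * 16 * 32 * 8
#     8192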
   t ||S )zCount flops for addmm.)r:   Z
self_shaper4   r5   r!   r#   r   r   r   
addmm_flop5   s    r<   c                 K   sD   | \}}}|\}}}	||ks t ||ks,t || |	 d | }
|
S )z"Count flops for the bmm operation.r1   r2   )r4   r5   r!   r#   br6   r7   b2r8   r9   flopr   r   r   bmm_flop:   s    

r@   c                 K   s
   t ||S )z&Count flops for the baddbmm operation.)r@   r;   r   r   r   baddbmm_flopG   s    rA   )x_shapew_shaper!   

def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    # A regular conv applies the filter at every *output* spatial position; a
    # transposed conv applies it at every *input* spatial position.
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop


@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        # Swap the first two dims (in/out channels) of a weight-like shape.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is computed by convolving grad_out with the swapped weight;
    # for a regular conv this behaves like a transposed conv and vice versa.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, t(w_shape), grad_input_shape, not transposed)

    # grad_weight is a convolution between the input and grad_out.
    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count
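
# Worked example (illustrative, with made-up shapes): a 3x3 convolution from
# 16 to 32 channels over a 64x64 output map on a batch of 8 multiplies a
# 16 * 3 * 3 patch per output channel at every output position:
#
#     >>> conv_flop_count([8, 16, 64, 64], [32, 16, 3, 3], [8, 32, 64, 64])
#     301989888  # 8 * (64 * 64) * (3 * 3) * 32 * 16 * 2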
|\}}}}||  kr8|krln n0||  krP|krln n||
krl|	|krl||
kspt d}|t|| ||f|| ||	f7 }|t|| ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   r3   r@   )query_shape	key_shapevalue_shaper=   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r   sdpa_flop_count   s    L""ra   c                O   s   t | ||S )Count flops for self-attention.ra   )rR   rS   rT   r!   r"   r#   r   r   r   	sdpa_flop   s    rd   )grad_out.c                 c   sN  |dk	r&t |jdkstt |jdks.t|dksF|j| jksFt| j\}}	}
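
# Worked example (illustrative, with made-up shapes): for batch 1, 8 heads,
# sequence length 128 and head dim 64, each of the two batched matmuls above
# costs 8 * 128 * 128 * 2 * 64 FLOPs:
#
#     >>> sdpa_flop_count((1, 8, 128, 64), (1, 8, 128, 64), (1, 8, 128, 64))
#     33554432  # 2 * (8 * 128 * 128 * 2 * 64)
#
# The backward formulas further below count five such matmuls instead of two,
# giving the usual ~2.5x forward cost.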
|j\}}}|j\}}}|dk	svt|dk	st|j|jkst|dd |dd   }|dd |dd   }t||D ]L\}}d|	||
f}d|||f}d|||f}|dk	r|nd}||||fV  qdS | j|j|j|dk	rB|jndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   lenr   r3   tolistzip)querykeyvaluere   	cum_seq_q	cum_seq_kmax_qmax_k_h_qrW   h_kd_kh_vr_   Zseq_q_lengthsZseq_k_lengthsZ	seq_q_lenZ	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shaper   r   r   %_unpack_flash_attention_nested_shapes   s(    
r|   c                 c   sT  |dk	r,t |jdkstt |jdks.t|dksF|j| jksFt| j\}}}	}
|j\}}}}|j\}}}}|dk	s|t|dk	st|j|jkst|dd |dd   }|dd |dd   }t||D ]L\}}d|	||
f}d|||f}d|||f}|dk	r|nd}||||fV  qdS | j|j|j|dk	rH|jndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   rg   rh   )rl   rm   rn   re   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krs   rt   rW   ru   rv   rw   r_   Z	seqlens_qZ	seqlens_klen_qZlen_krx   ry   rz   r{   r   r   r   )_unpack_efficient_attention_nested_shapes+  s(    
r   T)r,   c             	   O   s(   t | ||||||d}
tdd |
D S )rb   )rl   rm   rn   ro   rp   rq   rr   c                 s   s"   | ]\}}}}t |||V  qd S r   rc   .0rR   rS   rT   rs   r   r   r   	<genexpr>u  s   
z0_flash_attention_forward_flop.<locals>.<genexpr>r|   sum)rl   rm   rn   ro   rp   rq   rr   r!   r"   r#   sizesr   r   r   _flash_attention_forward_flop[  s    	r   c              	   O   s(   t | ||||||d}
tdd |
D S )rb   )rl   rm   rn   r~   r   r   r   c                 s   s"   | ]\}}}}t |||V  qd S r   rc   r   r   r   r   r     s   
z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )rl   rm   rn   biasr~   r   r   r   r"   r#   r   r   r   r   !_efficient_attention_forward_flop{  s    	r   c                 C   sR  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  krR|  krR|krn n*||
  krt|  krt|krn n||kst ||kr||kr||kst d}|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|S )Nr   rQ   )rN   rR   rS   rT   r`   r=   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   Z_b4Z_h4Z_s4Z_d4r   r   r   sdpa_backward_flop_count  s    P"""""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )rN   rR   rS   rT   r!   r"   r#   r   r   r   sdpa_backward_flop  s    r   c
              
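
# Illustrative note (made-up numbers): for a jagged batch described by
# cum_seq_q = cum_seq_k = tensor([0, 10, 24]), the unpack helpers above yield
# one shape tuple per batch element -- (1, h, 10, d) then (1, h, 14, d) -- and
# the flash/efficient attention formulas sum the per-element sdpa costs.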

@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# The suffixes for different orders of magnitude of flops.
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits
    # with some additional overflow, i.e. 1.01B should be displayed as 1001M,
    # not 1.001B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places.
    value = f"{number / 1000 ** index:.3f}"
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf
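
# Formatting sketch (illustrative): 123456 has six digits, so
# get_suffix_str(123456) picks index (6 - 2) // 3 == 1, i.e. "K", and
# convert_num_with_suffix(123456, "K") renders it as "123.456K".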

class FlopCounterMode(TorchDispatchMode):
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        # Custom formulas that did not go through ``register_flop_formula``
        # still need the shape-unwrapping treatment, unless they opted out.
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # We do a bit of messing around here to only output the "Global" value
        # if there are any FLOPs in there that aren't already fully contained
        # by a module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for idx, value in enumerate(values):
                values[idx][0] = " " + values[idx][0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        super().__enter__()
        return self

    def __exit__(self, *args):
        super().__exit__(*args)
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        return self._count_flops(func._overloadpacket, out, args, kwargs)

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            # Attribute the flops to every module currently on the stack.
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out
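
# Usage sketch (illustrative; mirrors the class docstring, with a made-up
# module and input shapes; ``get_table`` additionally requires the ``tabulate``
# package):
#
#     mod = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU(),
#                               torch.nn.Linear(32, 4))
#     with FlopCounterMode(depth=2) as flop_counter:
#         mod(torch.randn(8, 16)).sum().backward()
#     total = flop_counter.get_total_flops()
#     per_module = flop_counter.get_flop_counts()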
)F)N)N)N)F)Kr   Ztorch.utils._pytreer   r   r   Zmodule_trackerr   typingr   r   r	   r
   r   r   r   collectionsr   Ztorch.utils._python_dispatchr   Ztorch._decompr   mathr   	functoolsr   r   __all__ZopsZatenr   r   __annotations__r(   r   mmr   r:   Zaddmmr<   Zbmmr@   ZbaddbmmrA   r   rE   ZconvolutionZ_convolutionrK   Zconvolution_backwardrP   ra   Z'_scaled_dot_product_efficient_attentionZ#_scaled_dot_product_flash_attentionrd   r|   r   Z_flash_attention_forwardr   Z_efficient_attention_forwardr   r   Z0_scaled_dot_product_efficient_attention_backwardZ,_scaled_dot_product_flash_attention_backwardr   Z_flash_attention_backwardr   Z_efficient_attention_backwardr   r   r   r   r   r   r   r   r   r   r   r   <module>   s    $
	
 'g
63
60
 !               
