import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import List, Any, Dict, Optional, Union, Tuple, Iterator
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from torch._decomp import register_decomposition
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

aten = torch.ops.aten


def get_shape(i):
    # Tensors are reduced to their shapes; everything else passes through.
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: Dict[Any, Any] = {}


def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False):
    def register_fun(flop_formula):
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)
        register_decomposition(targets, registry=flop_registry, unsafe=True)(flop_formula)
        return flop_formula

    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contains the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB: Should be 2 * k - 1 technically for FLOPs.
    return m * n * 2 * k
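
# Sanity check for the matmul formula (an illustrative sketch, not part of the
# original module): an (m, k) @ (k, n) matmul performs m * n dot products of
# length k, i.e. roughly 2 * m * k * n FLOPs. Because ``register_flop_formula``
# wraps the formula with ``shape_wrapper``, it can be called directly with
# plain shape tuples:
#
#     >>> mm_flop((16, 32), (32, 8))  # 2 * 16 * 32 * 8
#     8192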
   t ||S )zCount flops for addmm.)r:   Z
self_shaper4   r5   r!   r#   r   r   r   
addmm_flop5   s    r<   c                 K   sD   | \}}}|\}}}	||ks t ||ks,t || |	 d | }
|
S )z"Count flops for the bmm operation.r1   r2   )r4   r5   r!   r#   br6   r7   b2r8   r9   flopr   r   r   bmm_flop:   s    

r@   c                 K   s
   t ||S )z&Count flops for the baddbmm operation.)r@   r;   r   r   r   baddbmm_flopG   s    rA   )x_shapew_shaper!   

def conv_flop_count(
    x_shape: List[int],
    w_shape: List[int],
    out_shape: List[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    # A regular conv applies the filter at every *output* spatial position; a
    # transposed conv applies it at every *input* spatial position.
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop


@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        # Swap the first two dims (in/out channels) of a weight-like shape.
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is computed by convolving grad_out with the swapped weight;
    # for a regular conv this behaves like a transposed conv and vice versa.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, t(w_shape), grad_input_shape, not transposed)

    # grad_weight is a convolution between the input and grad_out.
    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count
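
# Worked example (illustrative, with made-up shapes): a 3x3 convolution from
# 16 to 32 channels over a 64x64 output map on a batch of 8 multiplies a
# 16 * 3 * 3 patch per output channel at every output position:
#
#     >>> conv_flop_count([8, 16, 64, 64], [32, 16, 3, 3], [8, 32, 64, 64])
#     301989888  # 8 * (64 * 64) * (3 * 3) * 32 * 16 * 2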
|\}}}}||  kr8|krln n0||  krP|krln n||
krl|	|krl||
kspt d}|t|| ||f|| ||	f7 }|t|| ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   r3   r@   )query_shape	key_shapevalue_shaper=   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopsr   r   r   sdpa_flop_count   s    L""ra   c                O   s   t | ||S )Count flops for self-attention.ra   )rR   rS   rT   r!   r"   r#   r   r   r   	sdpa_flop   s    rd   )grad_out.c                 c   sN  |dk	r&t |jdkstt |jdks.t|dksF|j| jksFt| j\}}	}
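
# Worked example (illustrative, with made-up shapes): for batch 1, 8 heads,
# sequence length 128 and head dim 64, each of the two batched matmuls above
# costs 8 * 128 * 128 * 2 * 64 FLOPs:
#
#     >>> sdpa_flop_count((1, 8, 128, 64), (1, 8, 128, 64), (1, 8, 128, 64))
#     33554432  # 2 * (8 * 128 * 128 * 2 * 64)
#
# The backward formulas further below count five such matmuls instead of two,
# giving the usual ~2.5x forward cost.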
|j\}}}|j\}}}|dk	svt|dk	st|j|jkst|dd |dd   }|dd |dd   }t||D ]L\}}d|	||
f}d|||f}d|||f}|dk	r|nd}||||fV  qdS | j|j|j|dk	rB|jndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   lenr   r3   tolistzip)querykeyvaluere   	cum_seq_q	cum_seq_kmax_qmax_k_h_qrW   h_kd_kh_vr_   Zseq_q_lengthsZseq_k_lengthsZ	seq_q_lenZ	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shaper   r   r   %_unpack_flash_attention_nested_shapes   s(    
r|   c                 c   sT  |dk	r,t |jdkstt |jdks.t|dksF|j| jksFt| j\}}}	}
|j\}}}}|j\}}}}|dk	s|t|dk	st|j|jkst|dd |dd   }|dd |dd   }t||D ]L\}}d|	||
f}d|||f}d|||f}|dk	r|nd}||||fV  qdS | j|j|j|dk	rH|jndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   rg   rh   )rl   rm   rn   re   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krs   rt   rW   ru   rv   rw   r_   Z	seqlens_qZ	seqlens_klen_qZlen_krx   ry   rz   r{   r   r   r   )_unpack_efficient_attention_nested_shapes+  s(    
r   T)r,   c             	   O   s(   t | ||||||d}
tdd |
D S )rb   )rl   rm   rn   ro   rp   rq   rr   c                 s   s"   | ]\}}}}t |||V  qd S r   rc   .0rR   rS   rT   rs   r   r   r   	<genexpr>u  s   
z0_flash_attention_forward_flop.<locals>.<genexpr>r|   sum)rl   rm   rn   ro   rp   rq   rr   r!   r"   r#   sizesr   r   r   _flash_attention_forward_flop[  s    	r   c              	   O   s(   t | ||||||d}
tdd |
D S )rb   )rl   rm   rn   r~   r   r   r   c                 s   s"   | ]\}}}}t |||V  qd S r   rc   r   r   r   r   r     s   
z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )rl   rm   rn   biasr~   r   r   r   r"   r#   r   r   r   r   !_efficient_attention_forward_flop{  s    	r   c                 C   sR  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  krR|  krR|krn n*||
  krt|  krt|krn n||kst ||kr||kr||kst d}|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|t|| ||f|| ||f7 }|S )Nr   rQ   )rN   rR   rS   rT   r`   r=   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   Z_b4Z_h4Z_s4Z_d4r   r   r   sdpa_backward_flop_count  s    P"""""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )rN   rR   rS   rT   r!   r"   r#   r   r   r   sdpa_backward_flop  s    r   c
              
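
# Illustrative note (made-up numbers): for a jagged batch described by
# cum_seq_q = cum_seq_k = tensor([0, 10, 24]), the unpack helpers above yield
# one shape tuple per batch element -- (1, h, 10, d) then (1, h, 14, d) -- and
# the flash/efficient attention formulas sum the per-element sdpa costs.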

@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    # In case this is a nested tensor, we unpack the individual batch elements
    # and then sum the flops per batch element.
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


flop_registry = {
    aten.mm: mm_flop,
    aten.addmm: addmm_flop,
    aten.bmm: bmm_flop,
    aten.baddbmm: baddbmm_flop,
    aten.convolution: conv_flop,
    aten._convolution: conv_flop,
    aten.convolution_backward: conv_backward_flop,
    aten._scaled_dot_product_efficient_attention: sdpa_flop,
    aten._scaled_dot_product_flash_attention: sdpa_flop,
    aten._scaled_dot_product_efficient_attention_backward: sdpa_backward_flop,
    aten._scaled_dot_product_flash_attention_backward: sdpa_backward_flop,
    aten._flash_attention_forward: _flash_attention_forward_flop,
    aten._efficient_attention_forward: _efficient_attention_forward_flop,
    aten._flash_attention_backward: _flash_attention_backward_flop,
    aten._efficient_attention_backward: _efficient_attention_backward_flop,
}


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# The suffixes for different orders of magnitude of flops.
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits
    # with some additional overflow, i.e. 1.01B should be displayed as 1001M,
    # not 1.001B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places.
    value = f"{number / 1000 ** index:.3f}"
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf
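
# Formatting sketch (illustrative): 123456 has six digits, so
# get_suffix_str(123456) picks index (6 - 2) // 3 == 1, i.e. "K", and
# convert_num_with_suffix(123456, "K") renders it as "123.456K".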

class FlopCounterMode(TorchDispatchMode):
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, List[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[Dict[Any, Any]] = None):
        self.flop_counts: Dict[str, Dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        # Custom formulas that did not go through ``register_flop_formula``
        # still need the shape-unwrapping treatment, unless they opted out.
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> Dict[str, Dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # We do a bit of messing around here to only output the "Global" value
        # if there are any FLOPs in there that aren't already fully contained
        # by a module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for idx, value in enumerate(values):
                values[idx][0] = " " + values[idx][0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        super().__enter__()
        return self

    def __exit__(self, *args):
        super().__exit__(*args)
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}
        out = func(*args, **kwargs)
        return self._count_flops(func._overloadpacket, out, args, kwargs)

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)
            # Attribute the flops to every module currently on the stack.
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out
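
# Usage sketch (illustrative; mirrors the class docstring, with a made-up
# module and input shapes; ``get_table`` additionally requires the ``tabulate``
# package):
#
#     mod = torch.nn.Sequential(torch.nn.Linear(16, 32), torch.nn.ReLU(),
#                               torch.nn.Linear(32, 4))
#     with FlopCounterMode(depth=2) as flop_counter:
#         mod(torch.randn(8, 16)).sum().backward()
#     total = flop_counter.get_total_flops()
#     per_module = flop_counter.get_flop_counts()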
)F)N)N)N)F)Kr   Ztorch.utils._pytreer   r   r   Zmodule_trackerr   typingr   r   r	   r
   r   r   r   collectionsr   Ztorch.utils._python_dispatchr   Ztorch._decompr   mathr   	functoolsr   r   __all__ZopsZatenr   r   __annotations__r(   r   mmr   r:   Zaddmmr<   Zbmmr@   ZbaddbmmrA   r   rE   ZconvolutionZ_convolutionrK   Zconvolution_backwardrP   ra   Z'_scaled_dot_product_efficient_attentionZ#_scaled_dot_product_flash_attentionrd   r|   r   Z_flash_attention_forwardr   Z_efficient_attention_forwardr   r   Z0_scaled_dot_product_efficient_attention_backwardZ,_scaled_dot_product_flash_attention_backwardr   Z_flash_attention_backwardr   Z_efficient_attention_backwardr   r   r   r   r   r   r   r   r   r   r   r   <module>   s    $
	
 'g
63
60
 !               
