U
    zh                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlm  mZ d dlmZ d dlmZmZ mZ!mZ" d dl#m$Z$m%Z%m&Z&m'Z' d dl(mZ) d d	l*m+Z+m,Z, d d
l-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD d dlEmFZF d dlGmHZH ddlImJZJ ddlKmLZL ddlMmNZN ddlOmZmPZP ddlQmRZR ddlSmTZT ddlUmVZV ddlWmXZXmYZY ddlZm[Z[ ddl\m]Z] dd l^m_Z_ dd!lm`Z`maZambZbmcZc dd"ldmeZe ef r^d d#lgmhZhmiZi nejd$d%d&ZiekelZmejnoeld'Zpejnoeld(Zqd)Zrd*d+ Zsejteeu ejtd,d-d.Zvejtewd/d0d1Zxd2d3 Zyezdd4d5 Z{ezdd6d7 Z|d8d9 Z}d:d; Z~d<d= Zd>d? ZdewdAdBdCZejjeejjeejeuf f dDdEdFZejjdGdHdIZeejt dJdKdLZdejjeejt ewdMdNdOZdPdQ Zdeeje
f dRdSdTZejdUdV ZeRjejj eidWd$eLe jee"jdXd@dYdejjeejt ee8 eeeu  eweeu ewewewee2 eeejdf  eew eeee_ ge
f  ee/ejf dZd[d\Ze" dejjeejt ee8 eeeu  eweeu eweweweeejdf  eew eeee_ ge
f  ee/ejf d]d^d_Zejtd`dadbZeejt eeu ddcdddeZeeejt eeu f eeu eeu dfdgdhZeeejt ge
f eeu didjdkZe"jddldldldmejjeejt eeu eueeej  eweweejtdnf eejjdnf eeudnf do
dpdqZeeejt eeu f eeu drdsdtZejtd`dudvZejtejteeu dwdxdyZdejjeejt eeu dzd{d|Zedfejjeejt edne
f eeeje
f  d}d~dZed Zejjeejt ejjeuedne
f e8eue2dddZeLe jeddfejjeejt edne
f eeeje
f  eee@edne
f f  dddZeejt dddZejjdGddZejjeejt edne
f dddZejjeejt edne
f dddZe]ddddZdS )    N)count)AnyCallableDictListOptionalSequenceTupleUnion)mock)#min_cut_rematerialization_partition)compiled_autogradconfigloggingutils)countersdetect_fake_modeflatten_graph_inputslazy_format_graph_code)r   )aot_export_modulemake_boxed_func)	code_hashCompiledFxGraphFxGraphCache)BoxedDeviceIndexget_placeholders#log_cudagraph_skip_and_bump_counter)save_args_for_compile_fx_inner)	BoxedBoolcount_tangentsfresh_inductor_cacheshould_assume_input_alignedtensor_is_aligned)trace_structured)
OpOverload)
FakeTensor)compile_time_strobelight_meta)free_unbacked_symbols)FakeTensorProp   )aot_autograd)_use_lazy_graph_module)_PyTreeCodeGen   )r   metrics)DebugContext)select_decomp_table)joint_graph_passes)post_grad_passesview_to_reshape)pre_grad_passes)GraphLowering)ExternKernelNode) get_cloned_parameter_buffer_namehas_incompatible_cudagraph_ops#maybe_get_suppress_shape_guards_ctxoutput_node)V)log_optimus_to_scubatime_and_logattrc                 C   s   t jS N)dynamo_utilsidentityr>    rC   L/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/compile_fx.pyr=   P   s    r=   Z
perf_hintsZpost_grad_graphs   c                    s(   t  tjsd S  fddt jD S )Nc                    s,   g | ]$}  |d kr |dkr|qS )r   r-   )stridesize.0itrC   rD   
<listcomp>a   s       z%get_expanded_dims.<locals>.<listcomp>)
isinstancetorchTensorrangendimrK   rC   rK   rD   get_expanded_dims^   s    rS   )rL   expanded_dimsreturnc                 C   s"   |D ]}t jj| |dd} q| S )Nr   r-   )rO   opsatenslice)rL   rT   Zexpanded_dimrC   rC   rD   index_expanded_dimsd   s    rY   )rL   rU   c                 C   s   t | t|  } t| dkr|  }| j}ttt	|}dd t
t||D }tt	|D ]V}|dkrrdn|||d   }|dkrdn|||d   }|||  || k rb dS qbdS )Nr   c                 S   s   g | ]\}}|qS rC   rC   )rI   _xrC   rC   rD   rM   t   s     z*complex_memory_overlap.<locals>.<listcomp>r-   TF)rY   rS   ZsqueezerO   Z_debug_has_internal_overlaprF   shapelistrQ   lensortedzip)rL   stridessizesindicesrJ   Zprev_strideZ	prev_sizerC   rC   rD   complex_memory_overlapj   s    rd   c                 C   s2   t jj }tt| }|r"|js&|S ||jj S r@   )rO   _guardsTracingContexttry_getr]   rQ   fw_metadatastatic_parameter_indices)	num_fixedcontextfixedrC   rC   rD   get_static_input_idxs}   s
    
rm   c                   C   s
   t tS r@   )dynamo_loggingZget_step_loggerlogrC   rC   rC   rD   _step_logger   s    rp   c                   C   s2   t j r.t jjjjs.t j dkr.td d S )N)   r   zTensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.)	rO   cudaZis_availablebackendsmatmulZ
allow_tf32Zget_device_capabilitywarningswarnrC   rC   rC   rD   _warn_tf32_disabled   s    
rw   c                 C   s~  ddl m}m} i }| jddD ]"\}}|||< |||||jd q | jddD ]"\}}|||< |||||jd qP|jjdd}	g }
|	D ]x}|j	}||j
kr|j
| }|
| q||jkr|j| }|
| t|| |jt|< q||jkst|
d  qddlm} t|jjd	 jd }g }|D ]2}|j	|jkrT||j|j	  n
|d  q.|||
|t d |i }|S )
Nr   )_assign_attr	_AttrKindF)Zremove_duplicate)Z	attr_kindplaceholder)op)_unlift)Ztorch.export.unflattenrx   ry   Znamed_parametersZ	PARAMETERZnamed_buffersZBUFFERgraph
find_nodesnameZinputs_to_parametersappendZinputs_to_buffersclone_preserve_stridesmetar7   Zuser_inputsAssertionErrorZtorch.export._unliftr|   r]   nodesargsZbuffers_to_mutatepytreeZLeafSpec)modgmgraph_signaturerx   ry   Z
state_dictr   parambufferZplaceholder_nodesZlifted_inputsnodeZ	node_nameZparameter_nameZbuffer_namer|   outputsZmutated_outputsoutunlifted_gmrC   rC   rD   _unlift_graph   sf    





	r   c                 c   s   t t| jjdtjjjd| jjdtjjj	dD ]n}|j
tjjjkrn|jd j}|jd j}|V  |V  q4|j
tjjj	kr4|jd j}|jd j}|V  |V  q4d S )Ncall_functionr{   targetr-   r)   r   )r_   	itertoolschainr~   r   rO   rV   Zhigher_orderZcondZ
while_loopr   r   r   )r   r   Ztrue_subgraph_nameZfalse_subgraph_nameZcond_subgraph_nameZbody_subgraph_namerC   rC   rD   _get_subgraph_names   s&     r   c                 C   s:   t | D ]&}t| |}t|d d}t| || qt| |S )Nexample_inputs)r   getattr_recursive_pre_grad_passessetattrr4   )r   r   subgraph_namesubgraphZnew_subgraphrC   rC   rD   r      s
    
r   c                 C   s,   t | D ]}t| |}t| qt|  d S r@   )r   r   _recursive_joint_graph_passesr1   )r   r   r   rC   rC   rD   r      s    

r   Fis_inferencec                 C   s0   t | D ]}t| |}t|| qt| | d S r@   )r   r   _recursive_post_grad_passesr2   )r   r   r   r   rC   rC   rD   r     s    
r   )r   rU   c                 C   s4  ddl m}m}m}m}m} || }| }dd tt|jj	d j
d D }g }	g }
i }| jj	D ]2}|j|kr||
| qb|j| |krb|	| qb|
D ]4}d|j }|| ||||j  | ||j ||< q|	ddd D ]D}|jr|jD ]"}|j| |kstd| d	qq| j| q|   ||fS )
a  
    This function takes an GraphModule input "gm".
    The gm will be split into 2 components,
      1) const_gm, which consists the subgraph of gm that can be constant folded.
      2) gm (being inplace modified,) which returns the graph after constant folding.

    const_output_index is a mapping of corresponding node name from gm to the
    output index of const_gm.
    Returns (const_gm, const_output_index)
    r   )CONST_MODULE_TAGMETA_TAG
MODULE_TAGreplace_node_with_constantrun_and_get_constant_graphc                 S   s   i | ]\}}|j |qS rC   )r   rI   idxr[   rC   rC   rD   
<dictcomp>"  s     z"split_const_gm.<locals>.<dictcomp>r}   Z_FOLDED_CONST_Nznode: z user not empty.)Z torch._inductor.constant_foldingr   r   r   r   r   	enumeratetupler~   r   r   r   r   r   Zusersr   
erase_node	recompile)r   r   r   r   r   r   const_gmZconst_resultZconst_outputsZto_erase_nodeZto_replace_nodeconst_output_indexr   Znew_const_namenrC   rC   rD   split_const_gm
  s<    


"r   r   c                 C   s   t jj}|jj|jj|jj|jjh}|D ]\}| jj	d|dD ]F}t
|jdd t jr<|jd jt jkr<|jd jjdkr<  dS q<q(dS )Nr   r   valrr   TF)rO   rV   rW   mmdefaultZaddmmZbmmZbaddbmmr~   r   rN   r   getrP   dtypeZfloat32devicetype)r   rW   Ztf32_opsr   r   rC   rC   rD   is_tf32_warning_applicableC  s     r   r   c                 C   s>   t dd | D }tjr2|s2td tjddS t S dS )z
    For CPU backend, enable comprehensive padding causes some unit tests
    fail due to changing number of generated kernels. Skip for now.
    c                 s   s&   | ]}t |tjr|jjd kV  qdS )rr   N)rN   rO   rP   r   r   rI   rL   rC   rC   rD   	<genexpr>[  s     z6maybe_disable_comprehensive_padding.<locals>.<genexpr>z!Skip comprehensive padding on CPUF)comprehensive_paddingN)anyr   r   perf_hint_loginfopatch
contextlibnullcontext)r   Zhas_cudarC   rC   rD   #maybe_disable_comprehensive_paddingV  s    

r   )r   r   force_allow_non_fake_inputsc              	   C   sp   t |}|s.tjjdd}t| |dj|  n>|s:t ntj	
|dd}| t| |dj|  W 5 Q R X |S )z}
    If we can not detect fake mode from the context of inputs, create one.

    The created fake mode will be returned.
    Tallow_non_fake_inputs)moder   )r   rO   _subclassesFakeTensorModer(   	propagater   r   r   r   objectZpropagate_dont_convert_inputs)r   r   r   	fake_modectxrC   rC   rD   fake_tensor_propf  s    

r   c                  C   s^   t jr
dS t  sdS tjjd k	r&dS zddlm}  W n tk
rL   Y dS X | tj	
dkS )NTFr   MEMCACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)r   fx_graph_remote_cache	is_fbcoderO   versionZhipZtriton.runtime.fb_memcacher   ModuleNotFoundErrorZ_utils_internalZjustknobs_getval_intr   rC   rC   rD    should_use_remote_fx_graph_cache  s    r   rU   c              
   C   s*   t |  t  W  5 Q R  S Q R X d S r@   )r   r   Zget_config_copy)config_patchesrC   rC   rD   get_patched_config_dict  s    r   c              
   C   s.   t jr&t  | W  5 Q R  S Q R X n| S d S r@   )r   force_disable_cachesr    )frC   rC   rD   with_fresh_cache_if_config  s    r   zcompilation time (in seconds)Zinductor_compile)
phase_nameZfwd_only)r   r   
cudagraphsstatic_input_idxsis_backwardgraph_idcpp_wrapperaot_moder   boxed_forward_device_indexuser_visible_outputs
layout_optextern_node_serializerrU   c           !         sl  t | jdkr4|s4ddlm} ||  t| jS |dkr@g }tt	t
t| jjjd ttfsttd| j tjrt| |||||||||	|
|d |dkrttjj}||||||||
||d
}t }t }t||}tjsVtjs|rV|sVt|D ]6\}}t|tjr|jj dkr||krd|_!qt"j#t$| |||tj|d	}nt$| |f|}t%&d
t |  |r|j'rd|j(krt)d|j'  nt*d d  d7  < t+| tj,j-. }|dk	r|j/dk	rt0|j/dkst|j/1|j/ |r|S |rt2| }t0|jdks2tdd |jd D }t3dd |D }tjj4sddl5m6} || ||}|dk	}|r||_'nd}| dft7|  df| dft8dd |D dfg}dd |D }|sxtjj9s|D ]}t|tj:rt;| q|	dk	r2|s2|s2|	<t	t
|j= t>|j?||t	t
|j=|||t|j@A ttB| jt|jCd
|_?nt+| |rtjj9r|	dk	st|	jDdk	st|j? tjEj9jF|	jDdddk	st fdd}||_?d|j(kr|j'rt)|j' nt)d|  |s>tG|j?|} | |j?k	r>| |_?tH tIjJd|rRd nd! d"|  d|_K|S )#z
    Inductor API that compiles a single graph.

    If you change the argument list for this function, make sure you
    also update the call to save_args_for_compile_fx_inner below accordingly.
    r   )_LazyGraphModuleNzGinductor can only compile FX graphs which return a tuple/list, but got )
r   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   rr   T)localremotez%FX codegen and compilation took %.3fszskipping cudagraphs due to inductorZcudagraph_skipsr-   c                 S   s&   g | ]}t |tjjjr|jnd qS r@   )rN   rO   fxr   Nodestack_trace)rI   argrC   rC   rD   rM   ,  s   z$compile_fx_inner.<locals>.<listcomp>c                 s   s"   | ]}t |tjrt|V  qd S r@   )rN   rO   rP   rd   r   rC   rC   rD   r   1  s   z#compile_fx_inner.<locals>.<genexpr>)3check_for_mutation_ignore_cuda_graph_managed_tensorzmutated inputszincompatible opszcomplex memory overlapc                 s   s    | ]}t |tjtjfV  qd S r@   )rN   rO   rP   SymIntr   rC   rC   rD   r   M  s    znon-Tensor inputsc                 S   s   g | ]\}}|s|qS rC   rC   )rI   bsrC   rC   rD   rM   S  s      )r   device_indexstack_tracesr   r   	constantsplaceholdersmutated_input_idxsF)Zcreate_if_none_existsc                    s       | S r@   )Zset_to_running_backward
new_inputsZcompiled_graph_callablemanagerrC   rD   compiled_artifact  s    z+compile_fx_inner.<locals>.compiled_artifactztorchinductor done compiling 	BACKWARDSFORWARDS graph )LrA   Zcount_callsr~   Ztorch.fx._lazy_graph_moduler   Zforce_recompiler   forwardrN   nextiterreversedr   r   r   r]   r   r   Z	save_argsr   r   tritonr   timer   get_input_idxs_to_checkr   Zfx_graph_cacher   rO   rP   r   r   Z_is_inductor_staticr   loadfx_codegen_and_compilero   debugZdisabled_cudagraphs_reasonZdevice_typesr   r   disablere   rf   rg   output_stridesr^   extendr:   r   Z cudagraph_support_input_mutationtorch._inductor.cudagraph_utilsr   r8   allcudagraph_treesr   intsetZdevice_idxscudagraphifyZcurrent_callabler   valuesr   r   value	_inductorZget_manageralign_inputs_from_check_idxsrp   r   INFO_boxed_call)!r   r   r   r   r   r   r   r   r   r   r   r   r   r   Zgraph_kwargsstartr   inputs_to_checkrJ   inputcompiled_graphrk   outputr   Zcomplex_memory_overlap_inputsr   Zhas_mutation_strZhas_mutationZcudagraph_testsZcudagraph_fail_reasonsrL   r   Znew_callablerC   r   rD   compile_fx_inner  s^   

 





 


  


  r   )r   r   r   r   r   r   r   r   r   r   r   r   rU   c           !         s  t  rt  ttt d t tjd|r4dnd d|  t	j
 | t|}t  t  t |}W 5 Q R X t	|h t |d t	j
 | t
dtd d	d	d
 td fddd t rtdtt id W 5 Q R X t	| t| d }d }d }|rtjjrt \}}t |g |||||	||d	d
}t	!|( |spt"d|#  |$ \}}W 5 Q R X t  ||||||	|||||d}t%& }t	!| |j#|  g }|j'd k	r6|j'D ]P}t(|dr(t)t*|j+j,dkr(|-t.dd |j+j,D  n
|-d  qt/| |0 }|1 \}}}t% j2|7  _2t% j3|7  _3t% j4|7  _4|r@tj5j6r@t	j7j8s@tj9j:j;| r@d } j7j<D ]\}|j=>dd }|j?dkst@|tjArtj9j:;|sq|j=>dd  }r qqd}|r.| d| d}n
| d}|t	j7_8t	jBd	krt|W  5 Q R  W  5 Q R  W  5 Q R  S |rt	j7j8sddlCmD} |t	j7jEt	j7_8tF|||t	j7j8|G } W 5 Q R X W 5 Q R X W 5 Q R X | S )Ni  ztorchinductor compiling r   r   r  r   z%szAFTER POST GRADT)include_strideinclude_deviceZinductor_post_grad_graphc                      s    j ddddS )NFT)Zprint_outputr!  r"  )Zprint_readablerC   r   rC   rD   <lambda>  s     z(fx_codegen_and_compile.<locals>.<lambda>)Z
payload_fnZpt2_configs)Zextra_logging)	r   	shape_envr   r   r   r   r   r   Zis_const_graphz"AOT mode only supports C++ wrapper)r   r$  r   r   r   r   r   r   r   
const_codeZconst_modulelayoutr   c                 s   s   | ]}t jj|V  qd S r@   )r;   r~   ZsizevarsZ	size_hint)rI   r   rC   rC   rD   r   7  s    z)fx_codegen_and_compile.<locals>.<genexpr>r   rz   r   zWgraph with symbolic shapes inputs and config.triton.cudagraph_skip_dynamic_graphs=True.z Found from 
) check_lowering_disable_cudagraph)Hr   rw   syssetrecursionlimitmaxgetrecursionlimitrp   r   r  r;   r  Zfx_graph_shape_env_from_inputsr3   rO   Zno_gradr   set_fake_moder   Zfx_graph_transformedpost_grad_graphs_logr   r#   r   r   r<   strr   r   aot_inductorZuse_runtime_constant_foldingr   r5   Zset_graph_handlerr   runZcodegen_with_cpp_wrapperr.   ZCachedMetricsHelpergraph_outputshasattrr^   r'   r&  rF   r   r   _check_triton_bf16_supportZcompile_to_fnZcount_bytesZnum_bytes_accessednode_runtimesnodes_num_elemr  Zcudagraph_skip_dynamic_graphsr~   Zdisable_cudagraphs_reasonr  r   Zany_is_symbolicr   r   r   r{   rN   rP   aot_compilationr  r(  Zdevice_node_mappingr   Z
get_deltas)!r   r   r   r   r   r   r   r   r   r   r   r   r$  r   r   Zconst_graphr%  r   rZ   r~   Zmetrics_helperr  r   compiled_fnZ	num_bytesr7  r6  r   r   Zmeta_valr  r(  r  rC   r   rD   r
    s   
   





("r
  )r[   c                 C   sN   t dd t|  |  D d }t| |fd }t||  |  S )Nc                 s   s   | ]\}}|d  | V  qdS r-   NrC   rI   r\   rF   rC   rC   rD   r   w  s     z)clone_preserve_strides.<locals>.<genexpr>r-   )r-   )sumr`   rG   rF   rO   
as_stridedcloner[   Zneeded_sizer   rC   rC   rD   r   u  s    "r   )r   check_inputs_idxsrU   c                 C   s.   |D ]$}| |   t rt| | | |< qd S r@   )data_ptr	ALIGNMENTr   )r   r@  rJ   rC   rC   rD   copy_misaligned_inputs}  s    rC  )inputsr   rU   c              
   C   s   g }t | D ]p\}}t|tjs"q|jjdkr0qt 6 ||krTt|rTW 5 Q R  qt|shW 5 Q R  qW 5 Q R X |	| q|S )z
    This function runs at compile time, and generates a list of indices for which we
    might need to do a copy to preserve alignment requirements.
    rr   )
r   rN   rO   rP   r   r   r9   r"   r!   r   )rD  r   Zids_to_checkrJ   r  rC   rC   rD   r    s    r  )modelr  c                    s"   t  dkrS  fdd}|S )Nr   c                    s   t |   | S r@   )rC  r   r  rE  rC   rD   r2    s    
z)align_inputs_from_check_idxs.<locals>.run)r^   )rE  r  r2  rC   rF  rD   r    s    r  rC   )r   r   r   .)
rE  rD  r   r   r   r   r   r   r   r   c             
      sl   ddl m}
 tjjr0tj|
|||||||	dnttdd |D sR|S d   fdd}|S )Nr   )cudagraphify_impl)r   r   r   r   r   r   r   c                 s   s   | ]}t |tV  qd S r@   )rN   r%   )rI   inprC   rC   rD   r     s     zcudagraphify.<locals>.<genexpr>c              	      s0    d kr(t   |  W 5 Q R X  | S r@   )rA   preserve_rng_stater   r9  Zcudagraphify_fnrE  r   rC   rD   r2    s    
zcudagraphify.<locals>.run)Ztorch._inductor.cudagraph_treesrG  r   r  r  	functoolspartialr   )rE  rD  r   r   r   r   r   r   r   r   Znew_cudagraphify_implr2  rC   rJ  rD   r    s$    r  )rD  r   c                 C   sV   g }t || D ].\}}t|tjr| t dkr|| qt|t|krR|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )r`   rN   rO   rP   rA  rB  r   r^   )rD  r   Zaligned_static_input_idxsr   r  rC   rC   rD   remove_unaligned_input_idxs  s    rM  c                 C   sN   t dd t|  |  D d }tj|| j| jd}t||  |  S )z1
    Copy and input while preserving strides
    c                 s   s   | ]\}}|d  | V  qdS r:  rC   r;  rC   rC   rD   r     s     zstatic_input.<locals>.<genexpr>r-   )r   r   )	r<  r`   rG   rF   rO   emptyr   r   r=  r?  rC   rC   rD   static_input  s    "rO  dstsrcrT   c                 C   s"   t | |} t ||}| | dS )z=Index into expanded dimensions of both dst and src then copy_N)rY   Zcopy_rP  rC   rC   rD   index_expanded_dims_and_copy_  s    

rS  )rE  rD  r   c           	   	      s  t |}t|t|| t|ts,tfddt|D fddt|D tt|D ]0\}\}}t|tj	rf|krft
| || qftj  tj }|tj  tj| | t W 5 Q R X |  tj | tj  tj tjj|dd | tW 5 Q R X tttfsLftjrjfdd}n.fddttD   fd	d}t||S )
zQ
    Assumes inputs[static_input_idxs[i]] are always the same memory address
    c                    s$   g | ]\}}| krt |ng qS rC   )rS   r   r   rC   rD   rM     s   z%cudagraphify_impl.<locals>.<listcomp>c                    s8   g | ]0\}}t |tjs|n| kr,t|n| qS rC   )rN   rO   rP   rO  detachr   rT  rC   rD   rM   #  s   

Zthread_local)streamZcapture_error_modec                    s   t t | ksttt| D ]F\}\}}}t|tjs@q$|kr^| | ksjtq$t||| q$| 	   
  S r@   )r^   r   r   r`   rN   rO   rP   rA  rS  clearreplay)r   r   rQ  rR  rT   )r~   inps_expanded_dimsr   static_inputsstatic_outputsrC   rD   r2  E  s    
zcudagraphify_impl.<locals>.runc                    s   g | ]}| kr|qS rC   rC   )rI   r   rT  rC   rD   rM   X  s     c                    s:    D ] }| }t | | | | q|     S r@   )rS  rW  rX  )r   r   rT   )copy_indicesr~   rY  rZ  r[  rC   rD   r2  \  s      )r  rM  rC  rN   r]   r   r   r`   rO   rP   rS  rr   ZsynchronizeZStreamZwait_streamZcurrent_streamrV  Z	CUDAGraphr~   r   r   Zsize_assertsrQ   r^   r  )	rE  rD  r   Zcheck_input_idxsr   r[   rT   rV  r2  rC   )r\  r~   rY  r   rZ  r[  rD   rG    sB    












rG  )model_example_inputs_inner_compiler   c              
   C   s   |d krddin
|ddi}d|kr>t jjs>|dt| ji}|dd }tdH t| |t	j
|d|d|d}tj|std| |W  5 Q R  S Q R X d S )Nr   Tzaot_inductor.output_pathr   )r   r   )r_  r   z/AOTInductor compiled library does not exist at )r   r1  Zoutput_pathr   codepopr;   Zset_aot_compilation
compile_fxrK  rL  ospathexistsr   )r]  r^  r_  r   r   Zcompiled_lib_pathrC   rC   rD   compile_fx_aoti  s<    

 
rf  )aot_autograd_modelaot_example_inputsdynamo_modelnum_example_inputsr_  r   r   forward_devicec                    sh  ddl m}m}	 t|  tj| dd}
|
r>t|  d ||  |	||  \} fddD  t| }t }|j	j
^ }}|jd }tdd |D }tt|}tjj }|d k	r
|j}|d k	sttt|D ]}|krd ||< q|jr
||jj7 }tj|d	d  || |||d||
|d
	W 5 Q R X tjdkrPS fdd}d|_|S )Nr   )%convert_conv_weights_to_channels_lastfreezeTr   c                    s   g | ]} | qS rC   rC   )rI   ind)rh  rC   rD   rM     s     z(fw_compiler_freezing.<locals>.<listcomp>c                 s   s"   | ]}t |tjjr|jV  qd S r@   rN   rO   r   r   r   rI   r   rC   rC   rD   r     s     z'fw_compiler_freezing.<locals>.<genexpr>r   )r   r   r   r   r   r   r   c                    s"    fddD }    |S )Nc                    s   g | ]} | qS rC   rC   rH   r   rC   rD   rM     s     z9fw_compiler_freezing.<locals>.wrapper.<locals>.<listcomp>)rW  )r   Zargs_new)optimized_functionpreserved_arg_indicesrq  rD   wrapper  s    z%fw_compiler_freezing.<locals>.wrapper)Ztorch._inductor.freezingrl  rm  r   r5   Zdecide_layout_optr   r^   r   r~   r   r   dictfromkeysr]   rQ   rO   re   rf   rg   params_flatr   rh   ri   r   r   r   r;   r8  r  )rg  rh  ri  rj  r_  r   r   rk  rl  rm  r   Z	opt_modelrj   r   rZ   model_outputs_nodemodel_outputsr   r   tracing_contextrw  rJ   rt  rC   )rh  rr  rs  rD   fw_compiler_freezing  s\    




r{  )r]  r^  r_  r   decompositionsc                    sV  |r<t |( t|t ||dW  5 Q R  S Q R X t jr(t ddddd t| |}ttjj	rdd j
jD }tdd |D rtt ||D ]6\}}}	|j|	jkrtd	| d
|j d|	j dq|}t|tjdd|dW  5 Q R  W  5 Q R  S Q R X W 5 Q R X tjt|d}
tsNt||
S ttjj	rtj
jtrzt||
S t|tdd |D rt||
S t jrtt|tt jj  t!dt"t#|dk	r|nt$ }t%j&tjj	t'tj( t)d fdd}tj|dd}t j*rRt+ sRtjt, d}ntj|dd}dd }t-ddt%j&tjj	t'tj( d fdd}t.|ptj/j0dd}tj1j23 ptj12|}tj4dkrt5jdd t6|d|d\}}W 5 Q R X t7||}dj8kr.j8d |j8d< tj9: }|rFtj9j;nt<j=}t>|V t?@ B | 0 |||W  5 Q R  W  5 Q R  W  5 Q R  S Q R X W 5 Q R X W 5 Q R X t>| tj1A|x t?@ d t5jddL tB|||||dd |W  5 Q R  W  5 Q R  W  5 Q R  W  5 Q R  S Q R X W 5 Q R X W 5 Q R X W 5 Q R X dS )!z+Main entrypoint to a compile given FX graph)r_  r|  FT)r   ztriton.autotune_cublasLtztriton.cudagraphsztriton.store_cubinc                 S   s"   g | ]}|j d kr|jdqS )rz   r   )r{   r   r   )rI   r   rC   rC   rD   rM      s   
zcompile_fx.<locals>.<listcomp>c                 s   s   | ]}|d k	V  qd S r@   rC   )rI   vrC   rC   rD   r     s     zcompile_fx.<locals>.<genexpr>zBDevice mismatch between fake input and example input at position #z: z vs zx. If the model was exported via torch.export(), make sure torch.export() and torch.aot_compile() run on the same device.)r   c                 s   s   | ]}t |tttfV  qd S r@   )rN   r]   r   ru  )rI   r[   rC   rC   rD   r   .  s     N)rE  r   r   c              
      s  |rt |  tjjt|}i }tjrt| }t	j
|j }t|}tjj }|d k	rn|jrn|sn|jj}	nd}	ttjjrjj^ }
}|jdkstt	|j\}}
t|}n|}||kst|	| }||ksttdd ||	| D }| |t| ||dS )Nr   r  c                 s   s"   | ]}t |tjjr|jV  qd S r@   ro  rp  rC   rC   rD   r   |  s   z7compile_fx.<locals>.fw_compiler_base.<locals>.<genexpr>)r   r   r   r   r   r   )r   rO   r  r   Znum_fw_fixed_argumentsr^   r   Zkeep_output_strider:   r   arg_tree_leavesr   re   rf   rg   rh   Znum_mutated_inp_runtime_indicesrN   r   GraphModuler~   r   r{   r   tree_flattenru  rv  rm   )rE  r   r   rl   r   rx  ry  Znum_model_outputsrk   Zoriginal_output_start_indexrZ   Zorig_model_outputs_nodeZorig_model_outputsZnum_orig_model_outputsZorig_output_end_idxr   rk  r   r_  r]  rj  rC   rD   fw_compiler_base@  sP     


z$compile_fx.<locals>.fw_compiler_baser   )ri  rj  r_  r   r   rk  c                 [   s   t |  t| |f|ddiS )Ncompilerr   )r   r   )r~   Zjoint_inputskwargsrC   rC   rD   partition_fn  s     z compile_fx.<locals>.partition_fnbw_compiler)r   )rE  r   c              
      sZ   i }t jr2t| }tj|j }tdd |D }t| }| |t	t
| d|dS )Nc                 s   s"   | ]}t |tjjr|jV  qd S r@   ro  rp  rC   rC   rD   r     s     z2compile_fx.<locals>.bw_compiler.<locals>.<genexpr>T)r   r   r   r   r   r   )r   Zbw_outputs_user_visibler:   r   r~  r   ru  rv  r   r]   rQ   )rE  r   r   rx  ry  rl   )r   rk  r   r_  rC   rD   r    s$    

zcompile_fx.<locals>.bw_compilerr   )Zunlift_effect_tokens)Ztrace_jointr|  Z dynamo_flat_name_to_original_fqn)fw_compilerr  inference_compilerr|  r  Zkeep_inference_input_mutations)Cr   r   rb  r   r;   Zset_real_inputsrN   rO   r   r  r~   r   r  r`   r   r   
ValueErrorrK  rL  graph_returns_tuplemake_graph_return_tuple_codegenr,   handle_dynamo_export_graphr   r   r   Z_raise_error_for_testingr   r^   r   r  r   r   r  _graph_counterr0   rA   dynamo_timedr   rP   boolZfreezingZis_grad_enabledr{  r&   r   r   r   re   rf   rg   r8  functorch_configr   r   r   Z_CZ_is_any_autocast_enabledZ_DisableAutocastr   r   r.  r   r  tracingr*   )r]  r^  r_  r   r|  Zinputs_Zfake_inputsr   firJ   Zrecursive_compile_fxr  r  r  r  r  r   rz  r   r   r   Zdisable_amprk   rC   r  rD   rb    s   	.

L
(



H
 rb  )rD  c                 C   s@   d }t | }|d k	r|jS | D ]}t|tjr|jj  S qd S r@   )r   r$  rN   rO   r   r   )rD  r$  r   r  rC   rC   rD   r-    s    r-  c                 C   s~   t | tjjsdS t| j\}t |ttfr0dS t |tjjj	rzt
|jdrzt|jjjdkrztdd |jjjD rzdS dS )z"True if a FX graph returns a tupleT_schemar-   c                 s   s   | ]}t |jd kV  qdS )rP   N)r0  r   )rI   retrC   rC   rD   r     s     z&graph_returns_tuple.<locals>.<genexpr>F)rN   rO   r   r  r:   r   r]   r   r   r   r4  r   r^   r  Zreturnsr  )r   rvrC   rC   rD   r     s    
r  )r   rD  
compile_gmc              	      s   t | }|j\}t|\}| j| | j| W 5 Q R X | j| t| sZt	|| | t
  fdd}|S )z
    Mutate gm so it returns a tuple.  This is only needed for graphs
    not created by torchdynamo that return non-tuples.
    c                     s   t  | |S r@   )r   Ztree_unflatten)r   r  r9  specrC   rD   rt  %  s    z(make_graph_return_tuple.<locals>.wrapper)r:   r   r   r  r~   Zinserting_beforer  r   r  r   rK  wraps)r   rD  r  r   r  rt  rC   r  rD   r    s    	
r  c                    sL   | j j tjj  | j _|   ||  j| t fdd}|S )z
    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
    convert that to a normal FX graph so inductor can compile it.
    c                     s      j|   S r@   )Zprocess_outputsprocess_inputsrq  Zcodegenr9  rC   rD   rt  ;  s    z+handle_dynamo_export_graph.<locals>.wrapper)	r~   r  rO   r   ZCodeGenr   r  rK  r  )r   rD  r  rt  rC   r  rD   r  ,  s    	r  )r~   rU   c                 C   s   d ddd}| j  D ]L}t|ddd  }|jdks| tjkrHqtjjdd	r\ d S || q| j	D ]L}t|dd
d  }|jdksl| tjkrqltjjdd	r d S || qld S )Nr   c                 S   s6   ddl m} tj| }t|j d |dd S )Nr   )	SkipFramez9 does not support bfloat16 compilation natively, skippingzBF16 is not supported)Ztorch._dynamo.excr  rO   rr   Zget_device_propertiesru   rv   r   )r   r  Zdevice_propsrC   rC   rD   warn_and_skipC  s    
z1_check_triton_bf16_support.<locals>.warn_and_skipZ
get_devicec                   S   s
   t dS Nr   rO   r   rC   rC   rC   rD   r#  M      z,_check_triton_bf16_support.<locals>.<lambda>rr   F)Zincluding_emulationc                   S   s
   t dS r  r  rC   rC   rC   rD   r#  W  r  )
Zgraph_inputsr  r   r   Z	get_dtyperO   Zbfloat16rr   Zis_bf16_supportedr3  )r~   r  rH  r   r   rC   rC   rD   r5  B  s    	

r5  )F)F)N)NNFNFFFNNNN)
NNFNFFFNNN)rC   )rC   )r   rK  r   r   rc  r)  r  ru   r   typingr   r   r   r   r   r   r	   r
   Zunittestr   Ztorch._inductor.async_compilerO   Ztorch.fxZtorch.utils._pytreer   Z_pytreer   Zfunctorch.compiler   Ztorch._dynamor   r   Zdynamo_configrn   rA   Ztorch._dynamo.utilsr   r   r   r   Ztorch._functorchr  Ztorch._functorch.aot_autogradr   r   Ztorch._inductor.codecacher   r   r   r  r   r   r   Ztorch._inductor.debugr   Ztorch._inductor.utilsr   r   r    r!   r"   Ztorch._loggingr#   Z
torch._opsr$   Ztorch._subclasses.fake_tensorr%   Ztorch._utils_internalr&   Z%torch.fx.experimental.symbolic_shapesr'   Z torch.fx.passes.fake_tensor_propr(   Z_dynamo.backends.commonr*   Zfx._lazy_graph_moduler+   Zfx.graphr,    r.   r  r/   decompositionr0   Zfx_passes.joint_graphr1   Zfx_passes.post_gradr2   r3   Zfx_passes.pre_gradr4   r~   r5   Zirr6   r7   r8   r9   r:   Zvirtualizedr;   r   Ztorch._inductor.fb.utilsr<   r=   r0  	getLogger__name__ro   Z_loggingZgetArtifactLoggerr   r/  rB  rS   rP   r  rY   r  rd   rm   	lru_cacherp   rw   r   r   r   r   r   r   r  r   r   r   r   r   r   r  r   wrapZ_python_dispatchZ_disable_current_modesZuse_lazy_graph_moduler  r   rI  r
  r   rC  r  r  r   r  rM  rO  rS  rG  rf  r  r{  rb  r-  r  r  r  r5  rC   rC   rC   rD   <module>   s  (



B	9 

           

 z          

 O	 	$  	

1 ]
&
P
  


