U
    zhk                    @  s	  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZm Z  d dlm!Z! d dl"m#Z# d dl$m$Z$m%Z% d d	l&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4 d dl5Z5d dl6m7Z7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@mAZAmBZB d dlCmDZD d dlEmFZFmGZGmHZH d dlImJZJ d dlKmLZLmMZMmNZN d dlOmPZPmQZQmRZR e3r<d dlSmTZT d dlUmVZV d dlWmXZX d dlYmZZZ ej[\e]Z^ej[_ej[_e^Z`ej[ae`dZbejdkZce:d rd dlemfZf d dlgmhZh d dlimjZjmkZkmlZlmmZm n&dd Zjdd  Zkd!d" Zld#d$d%d&Zme5jnoepd'Zqd(ZrejdkZce
sepZtd)d)d*d+d,Zud-d. ZvG d/d0 d0ZwG d1d2 d2ewZxG d3d4 d4ewZyd)d$d5d6Zzd7d)d8d9d:Z{dd<d)d=d>d?Z|dd)d)d)d@dAdBdCZ}dd<d)d)dEdFdGZ~dd<d)d)d)d)dHdIdJdKZd)d)dLdMdNZd d)d<d#dPdQdRdSZejG dTdU dUZdVdVdWdXdYZdZd[ Zd\d] Zd^d_ Zd`da Zdbdc ZG ddde deejZdfdg Zd!didjZeddkdl Zdmdn ZejG dodp dpZG dqdr dreZG dsdt dtZdudvdwdxd)dydzd{ZG d|d} d}ZejG d~d dZd)d$ddZedd)d)dddZd)d$ddZd#d$ddZedd#d$ddZd#d$ddZdd Zd)d)dddZG dd dZejG dd deZejG dd deZejG dd deZejG dd deZG dd deZdd$ddZe Ze e e gZeddd$ddZdd$ddZd"d#d)dddZd#d#d#d)dddZd$d#d)dddZd)d$ddZd)d$ddZd)d$ddZd)d$ddZd)d$ddZd)d$ddZd)d$ddZedd#d$ddZeddd$ddÄZdPd$ddńZddǄ ZddɄ ZdOedOdOfd#dd#d#dd˜dd̈́ZdddOedOdOdOdOdOdhf
dd)d#d#d#dd#d#d#d#d#dd)dМdd҄Zd)dӜddՄZedd)dHd֜dd؄ZeGG ddڄ dڃZG dd܄ d܃ZeGejd)d$ddބZd)d$ddZe8dd)ddPdddZdaded< d)dddZeGG dd dZdd ZeGG dd deƃZeGG dd deȃZdddddZd#d#d#d#d#dddZdd ZeGG dd deȃZdd Zdd ZeGG dd  d ZG dd dZѐdd$ddZdd$ddZdd$dd	Zdd$d
dZdd$ddZ֐d%dd)d)dd)dddZG dd dZeGG dd dZG dd dZG dd deڃZG dd deڃZdS (&      )annotationsN)bisect_right)copy)c_void_pcdllCDLL)partial)Path)timetime_ns)
ModuleType)AnyCallablecastDict	GeneratorListOptionalSequenceSetTupleTYPE_CHECKINGUnion)countersdynamo_timed)configexcmetrics)cuda_env)_module_to_triton_kernel_reload_python_module _reload_python_module_in_subproc)	cache_dir)ALIGN_BYTESclear_on_fresh_inductor_cacheis_linux)trace_structured)extract_tensor_metadata
FakeTensorTensorMetadata)has_hinthint_intShapeEnv)Future)GraphLowering)ChoiceCaller)
HalideMetaz_inductor/script.ldwin32)build_paths)_run_build_command)log_global_cache_errorslog_global_cache_statslog_global_cache_valsuse_global_cachec                  O  s   d S N argskwargsr9   r9   K/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/codecache.pyr4   `   s    r4   c                  O  s   d S r8   r9   r:   r9   r9   r=   r5   c   s    r5   c                  O  s   d S r8   r9   r:   r9   r9   r=   r6   f   s    r6   boolreturnc                   C  s   dS NFr9   r9   r9   r9   r=   r7   i   s    r7   Zoutput_codeiX  str)namer@   c                 C  sz   t jjd krdndt jjdd }dtjj tjj }| d| }tj	
t |}tj	
|| }tj|dd |S )	Ncpucu. py_Texist_ok)torchversioncudareplacesysversion_infomajorminorospathjoinr"   makedirs)rC   Zcu_strpython_versionZbuild_folderZcpp_wrapper_dirZcpp_wrapper_build_directoryr9   r9   r=   cpp_wrapper_cache_dirw   s    
rY   c                   C  s   t jjd krdS dS )NZ
cubin_pathZ
hsaco_path)rL   rM   hipr9   r9   r9   r=   get_cpp_wrapper_cubin_path_name   s    r[   c                   @  s   e Zd ZeedddddZeeedddddZeedd	dd
dZ	ddddZ
ddddZdddddZdS )	CacheBaseNDict[str, Any]r?   c               	   C  s   zddl m}  |  }W n tk
r.   d }Y nX z,dtjtj jitjj|dd}W n t	t
fk
rx   i }Y nX ttj|ddd |d	< |S )
Nr   )
triton_keyrC   )rN   Ztriton)ZdevicerM   T)	sort_keysutf-8hash)Ztriton.compiler.compilerr^   ModuleNotFoundErrorrL   rN   Zget_device_propertiesZcurrent_devicerC   rM   AssertionErrorRuntimeErrorhashlibsha256jsondumpsencode	hexdigest)r^   Ztriton_versionsystemr9   r9   r=   
get_system   s*    

 

zCacheBase.get_systemr	   c                   C  s   t tjt dt d S )Ncachera   )r	   rT   rU   rV   r"   r\   rl   r9   r9   r9   r=   get_local_cache_path   s    zCacheBase.get_local_cache_pathzOptional[Path]c                   C  s*   t jd k	r&ttjt jt d S d S )Nra   )r   Zglobal_cache_dirr	   rT   rU   rV   r\   rl   r9   r9   r9   r=   get_global_cache_path   s    zCacheBase.get_global_cache_pathNonec                 C  s   t  | _d S r8   )r\   rl   rk   selfr9   r9   r=   __init__   s    zCacheBase.__init__c              	   C  s:   |   }| si S t|}t|}W 5 Q R X |d S Nrm   )rn   is_fileopenrg   load)rr   local_cache_pathZlocal_cache_fplocal_cacher9   r9   r=   get_local_cache   s    
zCacheBase.get_local_cache)ry   r@   c                 C  s0   |   }tt|tj| j|ddddd d S )N)rk   rm      )indentT	make_dirs)rn   write_atomicrB   rg   rh   rk   )rr   ry   rx   r9   r9   r=   update_local_cache   s    zCacheBase.update_local_cache)__name__
__module____qualname__staticmethod	functools	lru_cacherl   r$   rn   ro   rs   rz   r   r9   r9   r9   r=   r\      s    r\   c                   @  s.   e Zd ZdddddZddddd	d
ZdS )
LocalCacherB   Optional[Dict[str, Any]])keysr@   c                 G  s2   |   }|}|D ]}||kr&|| }q d S q|S r8   )rz   )rr   r   rm   	sub_cachekeyr9   r9   r=   lookup   s    
zLocalCache.lookupr   rp   )r   valuer@   c                G  sL   |   }|}|dd D ]}||i  || }q|||d < | | d S )Nr   )rz   
setdefaultr   )rr   r   r   rm   r   r   r9   r9   r=   	set_value   s    
zLocalCache.set_valueN)r   r   r   r   r   r9   r9   r9   r=   r      s   r   c                   @  s4   e Zd Zeddd Zddddddd	d
ZdS )PersistentCacheNc              	   C  sB   |   }|d ks| si S t|}t|}W 5 Q R X |d S rt   )ro   ru   rv   rg   rw   )rr   Zglobal_cache_pathZglobal_cache_fpZglobal_cacher9   r9   r=   get_global_cache   s    
z PersistentCache.get_global_cachezList[ChoiceCaller]rB   z4Optional[Callable[[Any], Dict[ChoiceCaller, float]]]zDict[ChoiceCaller, float])choicesopinputs	benchmarkr@   c              
     s  t  tt| j}tt| j}tt| j}i ddd fdd}tjsltj	rvtj
rz|  ni }	||	st r||  |ds|dk	rzt| tfdd D st|	i  |	 i i   D ] \}
}||	   |
 < qW n0 tk
rN } z|| |W 5 d}~X Y nX | |	 fd	d
 D }|| nt r||  |d S )aG  
        Check to see if we have benchmarked the given choice callers. For each
        choice caller:

            1. Check global_cache[op][inputs][choice][precision], return benchmark if cached.
            2. Check local_cache[op][inputs][choice][precision], return benchmark if cached.
            3. If benchmark is not None:
                a. `max_autotune_gemm=True`: benchmark the choice, update
                    local_cache[op][inputs][choice], and return the benchmark.
                b. `max_autotune_gemm=False`: don't benchmark the choice, return nothing.
        Nr>   r?   c                   sj   d} D ]N}|  }|| i i i krN|    | |< qd} qXq|rf||d |S )z2Check if `cache` contains data for all the choicesTF)cached)hash_keyget)rm   callbackhitchoiceZchoice_hashr   r   r   	precisiontimingsr9   r=   check_cache  s     
z+PersistentCache.lookup.<locals>.check_cache)r   c                 3  s   | ]}| kV  qd S r8   r9   .0r   r   r9   r=   	<genexpr>,  s     z)PersistentCache.lookup.<locals>.<genexpr>c                   s   i | ]}|   | qS r9   )r   r   r   r9   r=   
<dictcomp>8  s     z*PersistentCache.lookup.<locals>.<dictcomp>)N)rL   Zget_float32_matmul_precisionr   r5   rk   r6   r4   r   Zmax_autotuneZmax_autotune_gemmZautotune_local_cacherz   r7   r   allrc   r   itemsr   rd   r   )rr   r   r   r   r   Z	log_statsZlog_valsZ
log_errorsr   ry   r   ZtimingeZtimings_to_logr9   r   r=   r      sP        


zPersistentCache.lookup)r   r   r   r   r   r   r   r9   r9   r9   r=   r      s   
r   c                  C  s.   t jt d} t j| s*t j| dd | S )NlocksTrJ   )rT   rU   rV   r"   existsrW   )lock_dirr9   r9   r=   get_lock_dirD  s    r   bytes)datar@   c                 C  s&   t t|  d d d S )N3   r`   )base64	b32encodere   rf   digestdecodelower)r   r9   r9   r=   sha256_hashK  s    r   rG   zUnion[str, bytes])codeextrac                 C  s>   t | tr| n| d}|dkr2|d |d }dt| S )Nr`   rG   s   ||c)
isinstancer   ri   r   )r   r   Zhashing_strr9   r9   r=   	code_hashP  s    r   Tuple[str, str, str])basename	extensionspecified_dirr@   c                 C  sb   |r(t j|r|}q@t jt |}nt jt | dd }t j||  d| }| ||fS )N      rF   )rT   rU   isabsrV   r"   )r   r   r   subdirrU   r9   r9   r=   get_pathW  s    r   r   contentr   	hash_typec                 C  s8   |dkrt | |S |dkr&t t| S td| d S )Nr   )cubinhsacozUnknown hash type )r   reprrc   r   r9   r9   r=   get_hashe  s
    
r   Tuple[str, str])r   r   r   r   r   r@   c           	      C  sD   t |  ||}t|||\}}}tj|s<t|| dd ||fS )NTr}   )r   stripr   rT   rU   r   r   )	r   r   r   r   r   r   r   r   rU   r9   r9   r=   writem  s
    
r   )textr@   c                 C  s   t | dd S )zT
    Write the `text` to a file and return the path computed based on the hash.
    txtr   r   )r   r9   r9   r=   
write_text~  s    r   Frp   )rU   r   r~   r@   c              	   C  s   t |ttfstdt| } |r2| jjddd | jdt  dt	
  d }t |tr`dnd}||}|| W 5 Q R X ||  d S )Nz6Only strings and byte arrays can be saved in the cacheTparentsrK   rF   z.tmpwwb)r   rB   r   rc   r	   parentmkdirrT   getpid	threading	get_identrv   r   rename)rU   r   r~   Ztmp_pathZ
write_modefr9   r9   r=   r     s      r   c                   @  s"   e Zd ZU dZded< ded< dS )TensorMetadataAndValueszk
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    r)   Ztensor_metadata	List[Any]valuesNr   r   r   __doc____annotations__r9   r9   r9   r=   r     s   
r   r   )xr@   c                 C  s   | S r8   r9   r   r9   r9   r=   _ident  s    r   c                 C  s&   t | }t| ds"tj|ddd}|S )zs
    Extracts the tensor metadata and removes fields of the TensorMetadata
    that are not needed for caching
    Z_is_inductor_staticr   N)Zstorage_offsetZstorage_bytes)r'   hasattrdataclassesrO   )tmetar9   r9   r=   %extract_tensor_metadata_for_cache_key  s    
r   c                 C  s   t | }t|ffS )zH
    See FxGraphCachePickler. Custom reducer to pickle FakeTensors.
    )r   r   )r   metadatar9   r9   r=   _reduce_fake_tensor  s    r   c                 C  sV   | j r
tt }|  }t | }|dkr>td|dd t| }tt||ffS )a4  
    See FxGraphCachePickler. Custom reducer to pickle Tensors.
    If we see tensors, we know they're constants stored as attributes on
    the GraphModule. Include the values in the key calculation. Small
    tensors will be inlined, so we can't serve the same cache entry for
    different values anyway. Large constants are treated as parameters,
    so we could conceivably reuse a cache entry. To do that, however,
    PyCodeCache would need more complexity to create a new module from its
    cache, but with the right constants attached as attributes.
    g      ?z1FX graph cache handling of a large constant took z.1zs. Please file an issue.)		is_mkldnnBypassFxGraphCacher
   tolistwarningswarnr   r   r   )r   startr   elapsedr   r9   r9   r=   _reduce_tensor  s    
r   c                 C  s   t t| ffS )zD
    See FxGraphCachePickler. Custom reducer to pickle SymInts.
    )r   rB   sr9   r9   r=   _reduce_symint  s    r   c                 C  s   t dS )z
    See FxGraphCachePickler. Custom reducer to handle any objects that we don't
    support and therefore raise to bypass caching.
    N)r   r   r9   r9   r=   _reduce_unsupported  s    r   c                   @  s   e Zd ZdZej Zeee< e	ee
j< eee
j< eee
jjjj< eddddZedddd	d
ZedddddZdS )FxGraphCachePicklera:  
    Custom pickler to customize the pickling of some objects (Tensors), only for the
    purpose of computing a hash for keying into the FxGraphCache. Tensors contain
    objects that don't pickle and/or vary between runs, and we want to capture the
    data that allow us to compute a stable, but safe hash.
    r   r?   c                 C  sz   t  h}| |}z|| W n: ttfk
rZ } ztjddd t|W 5 d}~X Y nX | W  5 Q R  S Q R X dS )zA
        Pickle an object using the FxGraphCachePickler.
        zCan't pickleTexc_infoN)	ioBytesIOdump	TypeErrorAttributeErrorlogwarningr   getvalue)clsobjstreamZpicklerr   r9   r9   r=   rh     s    
zFxGraphCachePickler.dumpsr   rB   )r  r@   c                 C  s   |  |}t|S )zt
        Serialize an object using the FxGraphCachePickler and return a hash
        of the pickled object.
        )rh   r   )r  r  Zserialized_datar9   r9   r=   r   
  s    
zFxGraphCachePickler.get_hash)inpr@   c           
      C  s   dddd}g }t | D ]\}}t|tr|tt|D ]<}| || }|d| d| d| d|||   q<qt|tr| D ]8\}}	| |	}|d| d| d| d||	  qq| |}|d| d| d||  qd		|S )
z
        Get a printable string describing in more detail all the attributes
        comprising an object. Useful for debugging when one graph hashes
        to a different value than another.
        rB   r?   c                 S  s2   t | tjrtt| S t | tr&dS t| S d S )Nz<bytes>)r   rL   TensorrB   r   r   )r  r9   r9   r=   get_str  s
    
z.FxGraphCachePickler.debug_str.<locals>.get_str[z] z]: z: 
)
varsr   r   listrangelenr   appenddictrV   )
r  r  r	  linesattrr  iihkvr9   r9   r=   	debug_str  s    
.

*
"zFxGraphCachePickler.debug_strN)r   r   r   r   copyregdispatch_tabler   r   r(   r   rL   r  r   SymIntr   ZfxZexperimentalZ_backward_stateZBackwardStateclassmethodrh   r   r  r9   r9   r9   r=   r     s   



r   c              
   C  s   t t| |dd dD ]}|j|jd }|d k	s8t|j}|d k	sJtt|d&}|	|j
d |	|  W 5 Q R X |jrt|j|j d| qd S )Nc                 S  s   | j S r8   )rC   r   r9   r9   r=   <lambda>4      z!build_code_hash.<locals>.<lambda>r   rbr`   rF   )sortedpkgutiliter_modulesmodule_finder	find_specrC   rc   originrv   updateri   readispkgbuild_code_hashsubmodule_search_locations)rootsprefixhasherlibspecmoduler   r9   r9   r=   r*  3  s    r*  r9   c              
   C  sh   t  }|tjd t| d| |D ]4}tj	|r*t
|d}||  W 5 Q R X q*| S )Nr`   rG   r   )re   rf   r'  rL   __version__ri   r*  rT   rU   r   rv   r(  r   )r,  extra_filesr.  rU   r   r9   r9   r=   get_code_hashA  s    r4  c                    sL   t  s2tjt d} t g fdd| D S ddlm} |	d
 S )zS
    Compute a key that contains relevant information about torch source files
    )z"codegen/aoti_runtime/interface.cppz'codegen/aoti_runtime/implementation.cppcodegen/cpp_prefix.h	script.ldc                   s   g | ]}t j |qS r9   )rT   rU   rV   r   r   Zinductor_rootr9   r=   
<listcomp>Z  s     ztorch_key.<locals>.<listcomp>r   parutilztorch/src_hash.txt)r   	is_fbcoderT   rU   dirname__file__r4  libfb.pyr;  Zget_file_contentsrstrip)r3  r;  r9   r8  r=   	torch_keyL  s     rA  c                   C  s   t jtS r8   )rT   rU   r=  r>  r9   r9   r9   r=   get_inductor_rootb  s    rB  c                   @  s   e Zd ZU dZded< dS )OrderedSetHolderzb
    See FxGraphHashDetails. Holds a sorted list to support stable hashing
    of set kwargs.
    r   r   Nr   r9   r9   r9   r=   rC  f  s   
rC  c                   @  s   e Zd ZdZdS )r   zI
    Exception to indicate that the FxGraphCache should be bypassed.
    N)r   r   r   r   r9   r9   r9   r=   r   p  s   r   c                   @  s8   e Zd ZdZdgZddddddd	Zd
dddZdS )FxGraphHashDetailszz
    Object to capture all the details for a compiled FX graph relevant to computing
    a safe and stable cache key.
    Zgraph_idtorch.fx.GraphModuleList[torch.Tensor]r]   Sequence[int])gmexample_inputs	fx_kwargsinputs_to_checkc                 C  s   || _ || _i | _t|D ]D}|| jkrt|| tkrPtt|| | j|< q|| | j|< q|| _t	
 t	 t	jjjf| _t	jjjjt	jjjjt	jjjjf| _t | _t | _t | _d S r8   )rH  rI  rJ  r!  EXCLUDED_KWARGStypesetrC  rK  rL   Z$are_deterministic_algorithms_enabledZ-is_deterministic_algorithms_warn_only_enabledutilsZdeterministicZfill_uninitialized_memoryZ!deterministic_algorithms_settingsbackendsrN   matmulZ
allow_tf32Z&allow_fp16_reduced_precision_reductionZ&allow_bf16_reduced_precision_reductionZcuda_matmul_settingsrA  torch_versionr\   rl   Zsystem_infor   Zsave_config_portableZinductor_config)rr   rH  rI  rJ  rK  r  r9   r9   r=   rs     s(    




zFxGraphHashDetails.__init__rB   r?   c                 C  s
   t | S )z
        Get a printable string describing in more detail all the attributes
        comprising this object. Useful for debugging when one graph hashes
        to a different value than another.
        )r   r  rq   r9   r9   r=   r    s    zFxGraphHashDetails.debug_strN)r   r   r   r   rL  rs   r  r9   r9   r9   r=   rD  x  s   +rD  rE  rF  r]   rG  )rH  rI  rJ  rK  r@   c                   s^   t | |||}dt| |  td d   tjjddd  fddd S )	z=
    Generate a unique hash of the FX graph for caching.
    r   z$FX graph cache hash details for key z:
artifactc                   S  s
   dddS )NZfx_graph_cache_hashrg   )rC   encodingr9   r9   r9   r9   r=   r    s    z(compiled_fx_graph_hash.<locals>.<lambda>c                     s   t  ddS )Nr  )r   
components)rg   rh   splitr9   r  r   r9   r=   r    s   )Zmetadata_fn
payload_fn)	rD  r   r   r  r  debugrL   _loggingr&   )rH  rI  rJ  rK  detailsr9   rW  r=   compiled_fx_graph_hash  s    	r\  c                	   @  s   e Zd ZdZeddddZedddddZed	d
dddZeddddZeddddddZ	eddddddZ
eddddZeddddd d!d!d"d#d$Zed%d& Zd'S )(FxGraphCachea7  
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphCacheDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          <temp dir>/<fx graph hash>/<serialized metatdata>
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    rB   r?   c                   C  s   t jt dS )zS
        Get the toplevel temporary directory for storing compiled graphs.
        Zfxgraph)rT   rU   rV   r"   r9   r9   r9   r=   _get_tmp_dir  s    zFxGraphCache._get_tmp_dirr   r@   c                 C  s   t jt | dd | S )zA
        Return the disk location for a given cache key.
        r   r   )rT   rU   rV   r]  r^  r  r9   r9   r=   _get_tmp_dir_for_key  s    z!FxGraphCache._get_tmp_dir_for_keyr   zList[torch.SymInt]r   r@   c                 C  s   dd | D S )z
        Get the backed SymInt objects from the input list. Note that we can never
        have guards that depend on unbacked symint.
        c                 S  s$   g | ]}t |tjrt|r|qS r9   )r   rL   r  r*   r   r   r9   r9   r=   r9    s       z7FxGraphCache._filter_backed_symints.<locals>.<listcomp>r9   )r   r9   r9   r=   _filter_backed_symints  s    z#FxGraphCache._filter_backed_symintszOptional[ShapeEnv]c                  C  s   t jj } | sdS | jjS )zG
        Helper to get the shape env from the tracing context.
        N)rL   Z_guardsZTracingContextZtry_getZ	fake_mode	shape_env)ctxr9   r9   r=   _get_shape_env  s    zFxGraphCache._get_shape_envrF  zOptional[CompiledFxGraph])r   rI  r@   c                   s  t  }|dk	stt |}dd |D }dd fdd}d}| D ]D}	|	js^|	} qt||	j|}
td |	j||
 |
rL|	} qqL|dkrdS t	|j
d	d
 }tj|sLtd d  d7  < ttj|jddd |j}t }tj||kr>||krn*dtj| d}t|d| d|}t||dd zt|j
||j|jj|_W n$ tk
r   td| Y dS X |jrt||j|}|dksttd |j  t!j"#|j$ |S )z
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.
        Nc                 S  s   g | ]}t |qS r9   )r+   rb  r9   r9   r=   r9    s     z.FxGraphCache._lookup_graph.<locals>.<listcomp>z&Generator[CompiledFxGraph, None, None]r?   c               
   3  s   rt  } tj| rtt| D ]X}z0ttj| |d}t	
|V  W 5 Q R X W q( tk
r~   tjddd Y q(X q(rz"  }d k	rt	|V  W n" tk
r   tjddd Y nX d S )Nr   z,fx graph cache unable to load compiled graphTr   )r]  r`  rT   rU   r   r!  listdirrv   rV   picklerw   	Exceptionr  r  r   loads)r   rU   r   r   r   localremote_cacher9   r=   iterate_over_candidates!  s*    
 z;FxGraphCache._lookup_graph.<locals>.iterate_over_candidateszEfx graph cache key %s evaluating guards [%s] with values %s => hit=%srH      inductorZfxgraph_lookup_write_filer   Tr   z#include\s*"[^"]+"
#include "r}   z"Failed to load cached artifact: %sz*fx graph cache key %s post-load guards: %s)%r]  rf  rc   rc  guards_exprr>   Zevaluate_guards_expressionr  rY  r   	cache_keyrT   rU   r   r   r	   r=  r   source_codecpp_prefix_pathr   resubr   PyCodeCacheload_by_key_pathcache_linemap	constantscallcurrent_callableOSErrorerrorguardsr   ZCachedMetricsHelperZapply_deltasmetrics_deltas)r   rI  rl  rm  rd  symintshintsrn  graph	candidater   Zartifact_pathr   Zcpp_pppatterncheckr9   rk  r=   _lookup_graph  sv    


  zFxGraphCache._lookup_graphCompiledFxGraph)r   compiled_graphrI  c                 C  sD  t |}d|_t }|dk	s"tt|}||}	|j||	d|_zt	
|}
W n8 tk
r   tjddd td d  d7  < Y dS X zx|rt| }tj|stj|dd	 tj|t|
}t||
dd
 |rt r|
|d dn|
}|| | W n8 tk
r>   tjddd td d  d7  < Y nX dS )z=
        Store a serialized CompiledFxGraph on disk.
        N)Zplaceholdersr  z1fx graph cache unable to serialize compiled graphTr   rp  Zfxgraph_cache_pickle_errorr   rJ   r}   i@B )r   Ztime_taken_msz!fx graph unable to write to cacheZfxgraph_cache_write_error)r   r~  r]  rf  rc   rc  Zget_pruned_guardsZproduce_guards_expressionrs  rh  rh   ri  r  r  r   r`  rT   rU   r   rW   rV   r   r   r   r<  put)r   r  rI  time_taken_nsrl  rm  Zdisk_compiled_graphrd  r  r  r   r   rU   
cache_datar9   r9   r=   _save_graph  sP    

  
	zFxGraphCache._save_graphrE  )rH  c                 C  sv   t jst jjrtt dkr,td t| j	j
D ]<}t|jtjjrLt|jdkr4tt| |jtjjr4tq4dS )z
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        Nzfx graph cache no shape envgetattr)r   Zfreezingaot_inductorZuse_runtime_constant_foldingr   r]  rf  r  rY  r  nodesr   targetrL   Z_opsZHigherOrderOperatorr   r  _CZScriptObject)rH  noder9   r9   r=   _check_can_cache  s    

 zFxGraphCache._check_can_cachezCallable[..., Any]r]   rG  r>   )compile_fx_fnrH  rI  rJ  rK  rl  remotec                 C  sn  |s|st dd}zt| t||||}d}	|rd}
z6t r\ddlm} ||
}	nddlm	} ||
}	W n& t
k
r   d}	tjddd	 Y nX t||||	}|dkrtd
| td d  d7  < t }| ||f|}t | }t||||||	 n td| td d  d7  < W n> tk
rh   td d  d7  < |sd| ||f|}Y nX |S )z
        Load a compiled graph from the cache. If a cached entry does not exist,
        compile the graph and save it to the cache.
        z(at least one of them needs to be enabledNzfx-graph-v1r   )#FbMemcacheRemoteFxGraphCacheBackend)RedisRemoteCacheBackendzUnable to create a remote cacheTr   zfx graph cache miss for key %srp  Zfxgraph_cache_missr   zfx graph cache hit for key %sZfxgraph_cache_hitZfxgraph_cache_bypass)rc   r]  r  r\  r   r<  Ztriton.runtime.fb_memcacher  Ztorch._inductor.remote_cacher  ri  r  r  r  rY  r   r   r  r   )r  rH  rI  rJ  rK  rl  r  r  r   rm  Zcache_idr  r  
start_timer  r9   r9   r=   rw     sX    

   

	zFxGraphCache.loadc                   C  s,   zt t  W n tk
r&   Y nX dS )z.
        Clear out the on-disk cache.
        N)shutilrmtreer]  r^  FileNotFoundErrorr9   r9   r9   r=   clear(  s    zFxGraphCache.clearN)r   r   r   r   r   r^  r`  rc  rf  r  r  r  rw   r  r9   r9   r9   r=   r]    s&   	wC@r]  c                   @  s   e Zd ZU dZded< ded< ejddZded< d	ed
< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< dZded< ddd ddd!d"d#Z	d$d%d&d'd(Z
dS ))r  zr
    Class holding a compiled FX graph. This is the object serialized on disk
    to support FxGraph caching.
    zOptional[Callable[..., Any]]r~  rB   rt  F)r   ru  Optional[List[Tuple[int, str]]]r{  zSet[str]device_typeszSet[int]device_idxsmutated_inputsmutated_input_idxszDict[str, torch.Tensor]r|  z Dict[str, torch._C.ScriptObject]torchbind_constantsz)Optional[List[Optional[Tuple[int, ...]]]]output_stridesOptional[str]disabled_cudagraphs_reasonzmetrics.CachedMetricsDeltasr  rs  NzOptional[bool]_boxed_callr.   zList[Optional[Tuple[int, ...]]])r~  r  r  r  r  c              	   C  s   || _ |j| _|jr4t|j}| | _W 5 Q R X |j| _|j| _|j| _|j	| _	t
|j| _|j| _|j| _|| _|| _|| _d | _d S r8   )r~  rt  
cache_pathrv   r(  ru  r{  r  r  r  rN  r  r|  r  r  r  r  rs  )rr   r~  r  r  r  r  r   r9   r9   r=   rs   P  s     zCompiledFxGraph.__init__r   r   ra  c                 C  s   | j d k	st|  |S r8   )r~  rc   )rr   r   r9   r9   r=   __call__i  s    zCompiledFxGraph.__call__)r   r   r   r   r   r   fieldru  r  rs   r  r9   r9   r9   r=   r  3  s$   
r  c                  C  sV   t  r$tjjd krt S t S tt j	j
ttfrDtt j	j
} n
t j	j
f} t| S r8   )r   r<  rL   rM   rZ   r2   ccclangr   cppcxxr  tuplecpp_compiler_search)searchr9   r9   r=   cpp_compilern  s    
r  r   )r  r@   c                 C  s   | D ]}zz|d krlt jdkr W qtds.W qddlm} t }|tj|dt	d}| t
 }W 5 Q R X t|dg |W   S  tjttfk
r   Y qY qX qtjd S )NlinuxZTORCH_INDUCTOR_INSTALL_GXXr   FileLockzg++.locktimeout	--version)rP   platformrT   getenvfilelockr  r   rU   rV   LOCK_TIMEOUTinstall_gcc_via_conda
subprocesscheck_outputSubprocessErrorr  ImportErrorr   ZInvalidCxxCompiler)r  r  r  r   lockr9   r9   r=   r  x  s(    

 
r  c               	   C  s   t jt d} t j| dd}t j|std t jdd}|dkrVt	
d}|dk	rtj|dd	|  d
ddddgtjd |S )z>On older systems, this is a quick way to get a modern compilergccbinzg++zDownloading GCC via condaZ	CONDA_EXEcondaNcreatez	--prefix=z--channel=conda-forgez--quietz-yz
python=3.8Zgxx)stdout)rT   rU   rV   r"   r   r  infoenvironr   r  whichr  
check_callPIPE)r-  Zcxx_pathr  r9   r9   r=   r    s*    


r  c                   C  s&   t jdkrt rdS ttdt S )NdarwinFz(gcc|g\+\+)rP   r  is_apple_clangr>   rw  r  r  r9   r9   r9   r=   is_gcc  s    r  c                  C  s*   t  } t| dgd}d| d kS )Nr  utf8ZAppler   )r  r  r  r   
splitlines)r  version_stringr9   r9   r=   r    s    r  c                   C  s"   t jdkrt S ttdt S )Nr  z(clang|clang\+\+)r  r9   r9   r9   r=   is_clang  s    
r  c                 C  s   t rdnd}tj }d|d< z tj| dgtj|dj| }W nr tk
r } zTz tj| dgtj|dj| }W n. tk
r } zW Y 
W Y dS d }~X Y nX W 5 d }~X Y nX |	d	d
}|	dd
}|S )N)oemr9   CLC_ALLz-vstderrenvr  rG   rI   r  )
_IS_WINDOWSrT   r  r   r  r  STDOUTr   ri  rO   )compilerZSUBPROCESS_DECODE_ARGSr  r  r   r9   r9   r=   get_compiler_version_info  s2    
    0r  )	isa_flagsr@   c                 C  s(   t t }tj}| d|  d| }|S )N=)r  r  rL   r2  )r  Zcompiler_inforR  fingerprintr9   r9   r=    _get_isa_dry_compile_fingerprint  s    
r  c                   @  s   e Zd ZU ded< ded< ded< ded< d	Zd
ZddddZejfdddddZ	ddddZ
ddddZddddZedddddZdS )VecISAint
_bit_width	List[str]_macrorB   _arch_flagszDict[torch.dtype, int]_dtype_nelementsa  
#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON)
#include <ATen/cpu/vec/functional.h>
#include <ATen/cpu/vec/vec.h>
#endif

alignas(64) float in_out_ptr0[16] = {0.0};

extern "C" void __avx_chk_kernel() {
    auto tmp0 = at::vec::Vectorized<float>(1);
    auto tmp1 = tmp0.exp();
    tmp1.store(in_out_ptr0);
}
zG
import torch
from ctypes import cdll
cdll.LoadLibrary("__lib_path__")
r?   c                 C  s   | j S r8   )r  rq   r9   r9   r=   	bit_width  s    zVecISA.bit_widthztorch.dtype)dtyper@   c                 C  s
   | j | S r8   )r  )rr   r  r9   r9   r=   	nelements  s    zVecISA.nelementsc                 C  s   | j S r8   )r  rq   r9   r9   r=   build_macro  s    zVecISA.build_macroc                 C  s   | j S r8   )r  rq   r9   r9   r=   build_arch_flags  s    zVecISA.build_arch_flagsc                 C  s   t t| S r8   )ra   rB   rq   r9   r9   r=   __hash__  s    zVecISA.__hash__Nr>   c                 C  s`  ddl m}m} tjjd k	r$tjjS t r0dS ttj	dt
| jd\}}ddlm} t }|tj||d td}| tj|}|| d	d
}	|||g|	|}
zn|
 }tj|s|
 \}}|rW W 5 Q R  d	S tjtjdtjd|gtjtjddtjid W n4 tk
rF } zW Y W 5 Q R  d	S d }~X Y nX W 5 Q R  dS Q R X d S )Nr   )
CppBuilderCppTorchOptionsTr  r   r  .lockr  F)vec_isawarning_all-cZ__lib_path__
PYTHONPATH:r  ) torch._inductor.cpp_builderr  r  r   r  Z
vec_isa_okr<  r   r  	_avx_coder  r  r  r  r   rT   rU   rV   r  r=  Zget_target_file_pathisfilebuildr  r  rP   
executable_avx_py_loadrO   DEVNULLr  ri  )rr   r  r  r   
input_pathr  r   r  
output_dirZbuid_optionsZx86_isa_help_builderoutput_pathstatusZtarget_filer   r9   r9   r=   __bool__  sN    

	"zVecISA.__bool__)r   r   r   r   r  r  r  rL   floatr  r  r  r  r   r   r  r9   r9   r9   r=   r    s   
r  c                   @  sn   e Zd ZU dZdgZejdkr4e dkr4ed dZ	e
jde
jde
jdiZd	d
ddZejZded< dS )VecNEON   ZCPU_CAPABILITY_NEONr  armZAT_BUILD_ARM_VEC256_WITH_SLEEFrG         rB   r?   c                 C  s   dS )NZasimdr9   rq   r9   r9   r=   __str__Y  s    zVecNEON.__str__Callable[[VecISA], Any]r  N)r   r   r   r  r  rP   r  	processorr  r  rL   r  bfloat16float16r  r  r  r  r   r9   r9   r9   r=   r	  P  s   

r	  c                   @  sV   e Zd ZU dZdgZesdndZejdej	dej
diZddd	d
ZejZded< dS )	VecAVX512i   ZCPU_CAPABILITY_AVX512z0-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfmaz/arch:AVX512r      rB   r?   c                 C  s   dS )Navx512r9   rq   r9   r9   r=   r  j  s    zVecAVX512.__str__r  r  Nr   r   r   r  r  r  r  rL   r  r  r  r  r  r  r  r   r9   r9   r9   r=   r  _  s   
r  c                   @  sV   e Zd ZU dZdgZesdndZejdej	dej
diZddd	d
ZejZded< dS )VecAVX2r
  ZCPU_CAPABILITY_AVX2z-mavx2 -mfmaz
/arch:AVX2r  r  rB   r?   c                 C  s   dS )Navx2r9   rq   r9   r9   r=   r  y  s    zVecAVX2.__str__r  r  Nr  r9   r9   r9   r=   r  p  s   

r  c                   @  sR   e Zd ZU dZdddgZdZejdejdej	diZ
dd	d
dZejZded< dS )
VecZVECTORr
  ZCPU_CAPABILITY_ZVECTORzCPU_CAPABILITY=ZVECTORZHAVE_ZVECTOR_CPU_DEFINITIONz-mvx -mzvectorr  r  rB   r?   c                 C  s   dS )NZzvectorr9   rq   r9   r9   r=   r    s    zVecZVECTOR.__str__r  r  N)r   r   r   r  r  r  rL   r  r  r  r  r  r  r  r   r9   r9   r9   r=   r    s   
r  c                   @  sJ   e Zd ZU dZdgZdZi ZddddZdddd	Ze	j
Z
d
ed< dS )InvalidVecISAr   rG   rB   r?   c                 C  s   dS )NZINVALID_VEC_ISAr9   rq   r9   r9   r=   r    s    zInvalidVecISA.__str__r>   c                 C  s   dS rA   r9   rq   r9   r9   r=   r    s    zInvalidVecISA.__bool__r  r  N)r   r   r   r  r  r  r  r  r  r  r  r   r9   r9   r9   r=   r    s   
r  r  c                  C  sb   g } dddddd}t  }|dkr2|dkr2| S tj }tj }|| |d	 || |d
 | S )Nr  r>   rB   destZisa_supportedZisa_namec                 S  s   |r|  | d S r8   )r  r  r9   r9   r=   _check_and_append_supported_isa  s    z8x86_isa_checker.<locals>._check_and_append_supported_isax86_64AMD64r  r  )r  machinerL   rD   Z_is_cpu_support_avx2Z_is_cpu_support_avx512)Zsupported_isar  ZArchr  r  r9   r9   r=   x86_isa_checker  s    

r!  zList[VecISA]c               	   C  s   g } t jdkr&t dkr&| t  t jdkr4| S t dkrtdP}| }|sXqt	d|}|rJ|
 D ] }td|rp| t   qJqpqJW 5 Q R X nPt dkr| t  n6t d	krt }tD ]}t||kr|r| | q| S )
Nr  r  )r  r1   s390x/proc/cpuinfoz^features\s*:\s*(.*)$z[\^ ]+vxe[\$ ]+aarch64)r  r  )rP   r  r  r  r	  r   rv   readlinerw  matchgroupsr  r  r!  supported_vec_isa_listrB   )Zisa_listZ	_cpu_infolineZfeaturesmatchgroupZ_cpu_supported_x86_isaisar9   r9   r=   valid_vec_isa_list  s0    

r,  c                  C  s^   t  rt S t } | stS t jjd kr8| s0t| d S | D ]}t jj| kr<|  S q<tS )Nr   )	r   r<  r  r,  invalid_vec_isar  Zsimdlenrc   r  )Z_valid_vec_isa_listr+  r9   r9   r=   pick_vec_isa  s    
r.  T)compile_onlyr@   c                 C  s   | rdS dS )Nr  rG   r9   )r/  r9   r9   r=   get_compile_only  s    r0  )sharedr/  r@   c                 C  s2   | sdS |rdS t  dkr*dt kr*dS dS d S )NrG   -fPICDarwinr  z'-shared -fPIC -undefined dynamic_lookupz-shared -fPIC)r  rk   r  )r1  r/  r9   r9   r=   
get_shared  s    r4  )r  r@   c                 C  s   | rdS dS )Nz-WallrG   r9   )r  r9   r9   r=   get_warning_all_flag  s    r5  c                   C  s   dt ttjj S )Nz-D_GLIBCXX_USE_CXX11_ABI=)rB   r  rL   r  Z_GLIBCXX_USE_CXX11_ABIr9   r9   r9   r=   get_glibcxx_abi_build_flags  s    r6  c                  C  s$   dddg} t  r| d d| S )N
-std=c++17z-Wno-unused-variablez-Wno-unknown-pragmasz%-Werror=ignored-optimization-argument )r  r  rV   )flagsr9   r9   r=   	cpp_flags  s    

r:  c                   C  s   dS )Nz-D TORCH_INDUCTOR_CPP_WRAPPERr9   r9   r9   r9   r=   cpp_wrapper_flags  s    r;  c                  C  s   t jjrdnd} | d7 } t jjs(| d7 } t jjs8| d7 } t  rD| S tjdkrX| d7 } nt	 dkrn| d	7 } n| d
7 } t  s| d7 } | S )Nz-O0 -gz-O3 -DNDEBUGz" -ffast-math -fno-finite-math-onlyz -fno-unsafe-math-optimizationsz -ffp-contract=offr  z -Xclangppc64lez -mcpu=nativez -march=nativez	 -fopenmp)
r   r  Zdebug_compiler  Zenable_unsafe_math_opt_flagZ#enable_floating_point_contract_flagr<  rP   r  r   )Z
base_flagsr9   r9   r=   optimization_flags  s     


r=  c                   C  s   dS )Nz$-D C10_USING_CUSTOM_GENERATED_MACROSr9   r9   r9   r9   r=   use_custom_generated_macros8  s    r>  c                  C  s<   t  r4d} t }dd}d| d| d|  S dS d S )Nz&-D AOTI_USE_CREATE_TENSOR_FROM_BLOB_V1r8  )z-D C10_USE_GLOGz-D C10_USE_MINIMAL_GLOGz'-D C10_DISABLE_TENSORIMPL_EXTENSIBILITYz-Wp,-fopenmp rG   )r   r<  r2   
openmp_librV   )Zcreate_tensor_from_blob_v1r?  Zpreprocessor_flagsr9   r9   r=   use_fb_internal_macros<  s    r@  c                   C  s   t  rdS dS d S )Nz	-nostdincrG   )r   r<  r9   r9   r9   r=   use_standard_sys_dir_headersO  s    rA  c                  C  sJ   z,d} t |  d}tt|dkW S  t jk
rD   Y dS X d S )Nzconda list llvm-openmp --jsonr  r   F)r  r  rV  r   r  rg   rj  r  )commandoutputr9   r9   r=   is_conda_llvm_openmp_installedV  s    rD  zTuple[bool, str]c                  C  s\   z>t ddg t dddgd } tj| }|| fW S  t jk
rV   Y dS X d S )Nr  Zbrewz--prefixlibompr  )FrG   )r  r  r   r   rT   rU   r   r  )libomp_pathomp_availabler9   r9   r=   homebrew_libomp`  s    
rH  c                   C  s:   t  r6tjjd kr6dtjkr6dtjkr6t tjd< d S )N	CUDA_HOMEZ	CUDA_PATH)	r   r<  rL   rM   rZ   rT   r  r2   rN   r9   r9   r9   r=   _set_gpu_runtime_envt  s    
rJ  c                  C  sj   t td} |  s@t dkr@t td}|jjd  } | d  s`t	dt
|   t
| gS )Nincluder3  stdlibZHeaderszPython.hzCan't find Python.h in )r	   	sysconfigr   r   r  rk   r   absoluter   r   rB   )Zinclude_dirZstd_libr9   r9   r=   _get_python_include_dirs~  s    rO  c                 C  s   t | D ]\}}dtjkr|tjd rtj| dst|D ]@\}}}d|krFtj||| |< | tj| | d  qqFqd S )NrI  z/libcudart_static.azlibcudart_static.aZstubs)		enumeraterT   r  
startswithrU   r   walkrV   r  )lpathsirU   rootdirsfilesr9   r9   r=   _transform_cuda_paths  s    rX  z$Tuple[List[str], str, str, str, str])include_pytorchr  rN   aot_moder@   c                 C  sZ  t   ddlm} d}|tkr@| D ]}d| d}||7 }q&d}tjdkr| sj|tksj|sjtjj	r|
|t  }	||tdg }
g }t s|dd	g7 }|d
g7 }|s|dg7 }n:|dg7 }|r|	tjt g7 }	|rtjjd krt|
 |rLt rL|tkrLt| }d| d| d| d| dg}|rv|d kr`d}|tjjrpdnd7 }|rtjjd k	rt r|dg7 }n|ddg7 }|d7 }n$t r|dg7 }n|dddg7 }| }n|
|t  }	|r|	tjt g7 }	g }
tjdkrt  }tdd k	rtjtddd}tj|}|r|	tjtdd |
tjtdd n
t !d |p|}|rg ndg}|sBtd d k	rBt" }|rBtjtd d}|	tjtd d |
| t# j$d!krBtjtj|d"rBd#g}|st% \}}|r|	tj|d |
tj|d nt rdgnd
g}|rtjdkrt s|dd	g7 }tj&s|d$g7 }|
|j'g7 }
t rtjjd kr|	t()  |	t(*  |	t(+  tjjd k	rZ|	t(,  |	t(-  |	t(.  n*|	t(/  |	t(0  |	t(1  |	t(2  |	t(3  |	t(4  tjjd k	r|	t(5  n|	tjt(6 d |	d g }|r |r t r tjjd kr d%d&d'g}dd(d) |
D }d|d*d) |D  }|	||||fS )+Nr   cpp_extensionrG   z-D r8  r  LIBDIRrL   Z	torch_cpugompZtorch_pythonZompz-D CPU_CAPABILITY=z-D CPU_CAPABILITY_z-D HAVE_Z_CPU_DEFINITIONz -D USE_ROCMz -D USE_CUDAZamdhip64Zc10_hipZ	torch_hipz -D __HIP_PLATFORM_AMD__rN   Zc10_cudaZ
torch_cudar  Z
OMP_PREFIXrK  zomp.hr/  z-environment variable `OMP_PREFIX` is invalid.ZCONDA_PREFIXr  zlibiomp5.dylibZiomp5Zc10z-Wl,-Bstaticz-lcudart_staticz-Wl,-Bdynamicc                 S  s   g | ]}d | qS )-Lr9   r   pr9   r9   r=   r9  G  s     z1get_include_and_linking_paths.<locals>.<listcomp>c                 S  s   g | ]}d | qS )z-lr9   r`  r9   r9   r=   r9  H  s     )7rJ  torch.utilsr\  r-  r  rP   r  r   r  Zenable_kernel_profileinclude_pathsrO  library_pathsrM  get_config_varr<  rT   rU   r=  rv  rL   rM   rZ   rX  rB   upperrV   r  r  r  r   r  r   r   rD  unamer   rH  Zabi_compatibleZTORCH_LIB_PATHr2   ZsleefZopenmppythonZclang_includeZgcc_includeZgcc_install_tools_includeZ
cc_includeZlibgccZlibgcc_archZlibgcc_backwardglibcZlinux_kernelZrocmrN   )rY  r  rN   rZ  r\  macrosr   Z
macros_defr  ipathsrS  libscaprG  header_pathZ	valid_envZconda_lib_pathrF  Zstatic_link_libsZ
lpaths_strZlibs_strr9   r9   r=   get_include_and_linking_paths  s    
	


	











ro  zUnion[str, List[str]]zSequence[str])inputrC  r  r1  rY  r  rN   rZ  r/  use_absolute_pathuse_mmap_weightsextra_flagsr@   c              /   C  s  t ||||\}}}}}t| tr(| g} ddd |D }d}t r|r^|	s^| }|}t}n&dd | D }tj	|}tj	t}t
 st|d7 }|d7 }|d| 7 }d	t  }|d
t  7 }n| }|}d}|rd\}}d|}|
r|d7 }tdddt  d| dt|| dt| dt  dt  d| d| d| d| d| d| d| dt  dt  dt  dt  dt  dt| dd| d| d+ S )Nr8  c                 S  s   g | ]}d | qS z-Ir9   r`  r9   r9   r=   r9  _  s     z'cpp_compile_command.<locals>.<listcomp>rG   c                 S  s   g | ]}t j|qS r9   )rT   rU   r   )r   rT  r9   r9   r=   r9  h  s     z --rtlib=compiler-rtz -fuse-ld=lldz -Wl,--script=z-Bz -L)rG   rG   z -D USE_MMAP_SELFz[ \n]+z
            z
            -o z	
        )ro  r   rB   rV   r   r<  _LINKER_SCRIPTrT   rU   r   r  rc   r2   Z	glibc_librw  rx  r  r4  r5  r:  r6  r=  r;  r>  r@  rA  r0  r   )rp  rC  r  r1  rY  r  rN   rZ  r/  rq  rr  rs  rk  rS  rl  rj  r  Z
ipaths_strZclang_flagsZinp_nameZout_nameZlinker_scriptZlinker_pathsZinp_name_strr9   r9   r=   cpp_compile_commandL  s       


	
rv  )cmdc              
   C  sR   t | } zt|  W n4 tjk
rL } zt| |j|W 5 d }~X Y nX d S r8   )shlexrV  r  r  CalledProcessErrorr   CppCompileErrorrC  )rw  r   r9   r9   r=   run_command_and_check  s
    
r{  )rU   r@   c                 C  s"   |  drtj| S | dfS dS )zDReturns the path where the AOT Inductor compiled kernels are stored..sorG   N)endswithrT   rU   rV  )rU   r9   r9   r=   split_aot_inductor_output_path  s    
r~  c                   @  s^   e Zd ZU e Zded< eejZe	dddddddZ
e	dd	d
ddZe	dd ZdS )CudaKernelParamCachezDict[str, Dict[str, str]]rm   rB   zDict[str, str]rp   )r   paramsr   r@   c                 C  sL   t jjd krdnd}t|||ttjjd d\}}||t < || j	|< d S )Nr   r   r   )r   r   )
rL   rM   rZ   r   r~  r   r  r  r[   rm   )r  r   r  r   Zbin_typerI   rU   r9   r9   r=   rN    s    
	
zCudaKernelParamCache.setzOptional[Dict[str, str]]r_  c                 C  s   | j |d S r8   )rm   r   )r  r   r9   r9   r=   r     s    zCudaKernelParamCache.getc                 C  s
   | j  S r8   )rm   r   r  r9   r9   r=   get_keys  s    zCudaKernelParamCache.get_keysN)r   r   r   r  rm   r   r   r  cache_clearr  rN  r   r  r9   r9   r9   r=   r    s   

r  c                   @  s&   e Zd ZeddddddddZdS )	AotCodeCompilerr.   rB   r  r>   )r  ru  serialized_extern_kernel_nodesrN   r@   c                   s  t  }ttdd||jd}dd}t r\t |sRjrRt dd}qdt	 nddt
tjj\}td|d	\}	td
 tdfddfddd dddfdd}
dddfdd}ddlm} t }|tj||	d td}| t rZ|rZtjd d }t|d}|| W 5 Q R X |rhtjjntjd d }tjd d }tfdd j D }t  o|d!k}tjjrd}t|||jd||d"}td#| rt ||!  t"|d$ nt#| d%d&dd'd(d) t$fd*d j% D d+ fd,d j% D }|s|}d}n4t&t't()dt(*t(j+j,d-- }t./d.|d/ |}|
|d0t0j1 |}t||g|||j|d1}td2| rt ||g||!  t"|d3 nt#| |rtt|d4@}|2 }|d5d6|d6    || |t./d7| W 5 Q R X td84}|d9 |d:| d9 |d;| d9 W 5 Q R X W 5 Q R X |S )<NrT  o)r  rN   rZ  FTldobjcopyr  )r   r   zOutput code written to: %sZ
graph_dumpc                     s   dd dS )NZinductor_aot_coder  )rC   rM  filenamer9   r9   )r  r9   r=   r    s    z)AotCodeCompiler.compile.<locals>.<lambda>c                     s    S r8   r9   r9   ru  r9   r=   r    r  )rX  r   rB   )constsr@   c              	     s  t | dd\}}tj|d d } rl dtj| dtj| }t|||  t|d n d| d| }t| t	
d| jtj @ rt| d	krtd
d}nd}ttd @ dkrtdkstd d| dt d| d| 	}t	
d| t| d| }t	
d| t|  rVtddtj|}ntdd|}g }| d| d|  | d| d|  | d| d|  t	
dd| |D ]}t| q|S )Nr  r   r   .oz -r -b binary -o r8    zaot constant binary command: %s 5wzPModels with buffer mutation included doesn't support constants greater than 2GB!z .data=.ldataz1 .data=.lrodata,alloc,load,readonly,data,contentsr   @   zmust be power of 2 and >= 64z --rename-sectionz --set-section-alignment .data=z'aot constant rename section command: %szrm z$aot constant bin removal command: %sz[\W]rI   z --redefine-sym _binary_z#_start=_binary_constants_bin_start z!_size=_binary_constants_bin_size z_end=_binary_constants_bin_end z'aot constant binary redefine symbol: %s)r   rT   rU   splitextr   compile_filerV  chmodr{  r  rY  Zmutated_buffersrN  r|  r   r  
ValueErrorr#   rc   rw  rx  r  rV   )r  rI   consts_pathconsts_orw  Zrename_databodyZsymbol_list)fbcode_aot_cpu_rer  
ld_commandobjcopy_commandspecified_output_pathr9   r=   _compile_consts_linux  sf    
$

z6AotCodeCompiler.compile.<locals>._compile_consts_linuxc              	     sl  t jjr&t| d d\}}td| t| dk}d}|d7 }|d7 }|sr| D ]}|d| d	7 }qN| s|d
7 }n |d7 }|dt| d  d	7 }|d7 }|d7 }t|d d\}}tj	|d d }t
  d| d| }t| |rht|dl}	|	d |	d}
|
d}|dks$t|	| d}|t| k r^|	| |d  }||7 }q2W 5 Q R X |S )Nr  r  zbinary constants path: %s   z	.section	__DATA,__data
z%	.globl	__binary_constants_bin_start
z__binary_constants_bin_start:
z	.byte r  z
	.space 1
z	.quad 0x1234567899abcdef
z	.space r  z".globl	__binary_constants_bin_end
z__binary_constants_bin_end:
Sr   r   -c -o r8  zr+bs   ͫxV4r   )r   r  Zdebug_dump_consts_binr   r  rY  r  rT   rU   r  r  r{  rv   seekr(  findrc   )r  rI   Z_binary_constants_pathZis_large_constsZ
consts_asmr   r  r  rw  r   hdrZ	start_idxposrc)r  r9   r=   _compile_consts_darwin9  sP    






z7AotCodeCompiler.compile.<locals>._compile_consts_darwinr   r  r  r  z.jsonr   r|  r  c                 3  s<   | ]4\}}| j kr|jr(tjj|n
|  V  qd S r8   )folded_constantsr   rL   opsmkldnn_nbytesuntyped_storagenbytes)r   rC   Ztensorr  r9   r=   r   {  s
   
z*AotCodeCompiler.compile.<locals>.<genexpr>r  )rp  rC  r  rN   rZ  r/  rq  rr  zaot compilation command: %sr  ztorch.Tensorr>   )r   all_cudar@   c           	      S  s   dd }dd l }|  dkr dS | jrDtjj| }tjj| }n|  	 }| }|
 }||||j| }t|j}|r|S ||S )Nc                 S  s$   |  t| t d t t d}|S )Nr       )ljustr  r#   )	raw_bytesZpadded_bytesr9   r9   r=   _pad_to_alignment  s
    zEAotCodeCompiler.compile.<locals>._to_bytes.<locals>._pad_to_alignmentr   r  )ctypesnumelr   rL   r  r  data_ptrr  r  rD   r  r   ZPOINTERZc_ubyter   contents)	r   r  r  r  r  r  Zt_cpuZ	raw_arrayr  r9   r9   r=   	_to_bytes  s     	
z*AotCodeCompiler.compile.<locals>._to_bytesc                 3  s$   | ]}| j kr |jV  qd S r8   )r  get_original_value_of_constantZis_cudar   rC   r  r9   r=   r     s   
r  c                 3  s(   | ] }|j kr |V  qd S r8   )r  r  r  )r  r  r  r9   r=   r     s   
)r   Zqqr  )r  r  )rp  rC  r  rN   rZ  rq  zaot linkage command: %si  za+b    i @  qar  z// Compile cmd
// z// Link cmd
// )3r.  r   rv  rZ  r   r<  r2   r  Zobjcopy_fallbackr  r~  r  r  r   output_code_logr  r&   r  r  r   rT   rU   rV   r  r  rv   sumr|  r   Zforce_mmap_weightsr  rY  r  rV  r  r{  r   r   r   r  rL   randintZiinfoZint64maxitemstructpackrP   r  tell)r  r  ru  r  rN   Zpicked_vec_isaZcpp_commandrq  Zspecified_so_namer   r  r  r  r   r  Zoutput_jsonr   Z	output_soZoutput_oZconsts_sizerr  Zcompile_cmdZserialized_weightsZaot_constantsZmagic_numberr  Zlink_cmdZf_soZso_sizer9   )	r  r  r  r  r  r  r  ru  r  r=   compile  s    	






D/

 

&zAotCodeCompiler.compileN)r   r   r   r  r  r9   r9   r9   r=   r    s   r  c               	   C  s<   t tjd } |  }| }t|d\}}W 5 Q R X |S )Nr5  r  )r	   r>  r   rv   r(  r   )rU   r   r   rI   r  r9   r9   r=   rv    s    
rv  c                  C  s2   t  } t r"dtj|  dS d|  dS d S )Nrr  rq  )rv  r   r<  rT   rU   r   r  r9   r9   r=   
cpp_prefix	  s    r  )r  r  rw  r@   c              
   C  s  t | tr| gn| }dd |D }zt rt }tj|}tj|}tjt	d}t
 }	t|tj|	| tttj|	d t||D ]\}
}t|
tj|	| qtj|	d}t|| t||	|}tj|rt| t|| W 5 Q R X ntj|tjd W np tjk
r } zN|jd}d|kpRd|k}|rrtjd	krrd
}||7 }t|||W 5 d }~X Y nX d S )Nc                 S  s$   g | ]}t  rtj|n|qS r9   )r   r<  rT   rU   r   )r   ipr9   r9   r=   r9  	  s    z compile_file.<locals>.<listcomp>rK  r6  )r  r`   z'omp.h' file not foundrE  r  a  

OpenMP support not found. Please try one of the following solutions:
(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ that has builtin OpenMP support;
(2) install OpenMP via conda: `conda install llvm-openmp`;
(3) install libomp via brew: `brew install libomp`;
(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path with `include/omp.h` under it.)r   rB   r   r<  rv  rT   rU   r   rV   _TORCH_PATHtempfileTemporaryDirectoryr  r   ru  zipcopytreer3   r   remover  r  r  ry  rC  r   rP   r  r   rz  )r  r  rw  Zinput_pathsZinput_filesrn  header_nameZoutput_nameZtorch_includes_pathZtmp_dirra  r   Zdest_include_pathZoutput_file_pathr   rC  Zopenmp_problemZinstructionr9   r9   r=   r  	  s<    


	r  zOptional[CDLL]_libgomp)r   c                   s    fdd  fdd|D }|  ds4t| d d }t| dD ]$\}}|dkr`t|}t||}qFt|st| d	 || }t|t	t
fr|D ]}t|tjst| d
 qtjj|S t|tjst| d tjj|S d S )Nc                   sN   t t| dkrtjj| S t| ttfrFt|  fdd| D S | S d S )Nz<class 'PyCapsule'>c                 3  s   | ]} |V  qd S r8   r9   r   r  convert_argr9   r=   r   K	  s     z9custom_op_wrapper.<locals>.convert_arg.<locals>.<genexpr>)	rB   rM  rL   r  _aotiZ&alloc_tensor_by_stealing_from_void_ptrr   r  r  )argr  r9   r=   r  F	  s
    z&custom_op_wrapper.<locals>.convert_argc                   s   g | ]} |qS r9   r9   r   r  r  r9   r=   r9  O	  s     z%custom_op_wrapper.<locals>.<listcomp>z
torch.ops.z, can not be called through custom_op_wrapperrF   r   z, can not be loaded through custom_op_wrapperz returns a list of non-tensorsz returns a non-tensor)rQ  rc   rP  rV  	importlibimport_moduler  callabler   r  r  rL   r  r  r  Z#unsafe_alloc_void_ptrs_from_tensorsZ!unsafe_alloc_void_ptr_from_tensor)r   r;   Zconverted_argsfuncrT  r   resultrr9   r  r=   custom_op_wrapperC	  s$    	
r  c                   @  s   e Zd ZU i Zded< eejZi Zded< edddddd	Z	e
ddddd
dZe
dddddZe
ddddddZdS )CppCodeCache0Dict[str, Callable[[], Union[CDLL, ModuleType]]]rm   r]   cpp_compile_command_flagsrB   zUnion[CDLL, ModuleType]rU   r   r@   c                 C  s
   t | S r8   )r   LoadLibrary)rU   r   r9   r9   r=   _load_library_innerk	  s    z CppCodeCache._load_library_innerc              
   C  s   z|  ||}||_|W S  ttfk
r } zvdt|krntjdrnt	da
|  ||}||_| W Y 8S dt|krt| dt  dt  d| W 5 d }~X Y nX d S )Nr^  z/usr/lib64/libgomp.so.1z(failed to map segment from shared objectz3.  The most common reason this may occur is if the zl folder is mounted with noexec (e.g., by default Docker mounts tmp file systems as noexec).  Please remount zi with exec enabled, or set another temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable.)r  r   r  r  rB   rT   rU   r   r   r  r  r  
gettempdir)r  rU   r   r  r   r9   r9   r=   _load_libraryo	  s"    
zCppCodeCache._load_libraryFNr9   r  c                   s$   j |t |d}t  ddlm}m} |dd|f |d}t| }	t|d|	d\}
 j	krdd	l
m} tjt d
 }|
d d d d d tt||
tf |
d| fdd}|d k	r||td tjs|W 5 Q R X | j	<  j	 S )N)rN   r  rs  r   r  CppTorchCudaOptionsr  rT  )rC   sourcesBuildOptionr  r  r  r  so)rp  rC  c                    sF   d krBd k	r    } | d ks*t d k	sBtS r8   )r  rc   r  )r  r  futurer   r/  r  Z	worker_fnr9   r=   load_fn	  s    z(CppCodeCache.load_async.<locals>.load_fnr  )r  r.  rJ  r  r  r  r   get_command_liner   rm   r  r  rT   rU   rV   r   r   r   _worker_compile_cpprv  r  r   )r  ru  rN   	submit_fnrs  Zcompile_commandr  r  dummy_builderZ	dummy_cmdr  r  	lock_pathr  r9   r  r=   
load_async	  sN       


zCppCodeCache.load_asyncr>   )ru  rN   c                 C  s   |  || S r8   )r  )r  ru  rN   r9   r9   r=   rw   	  s    zCppCodeCache.load)FNr9   )F)r   r   r   rm   r   r   r  r  r  r  r  r  r  rw   r9   r9   r9   r=   r  e	  s   

<r  c              	   C  sF   ddl m} || td$ tj|s8t||t| W 5 Q R X d S )Nr   r  r  )	r  r  r  rT   rU   r   r  rx  rV  )r  r  r  rw  r  r9   r9   r=   r  	  s    r  c                   @  s   e Zd ZU i Zded< eejZdddZdZ	dZ
dZed	Zed
d
ddddZeddd
ddddddZeddddZdS )CppPythonBindingsCodeCacher  rm   FTrY  r1  kernelzkernel(%s);Py_RETURN_NONE;rG   a  
        // Python bindings to call %s():
        #define PY_SSIZE_T_CLEAN
        #include <Python.h>
        #include <sstream>
        #include <cstdlib>

        #ifndef _MSC_VER
        #if __cplusplus < 202002L
        // C++20 earlier code
        // https://en.cppreference.com/w/cpp/language/attributes/likely
        #define likely(x)       __builtin_expect(!!(x), 1)
        #define unlikely(x)     __builtin_expect(!!(x), 0)
        #endif
        #endif

        // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow.
        // We manually link it below to workaround issues with fbcode build.
        static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj);

        template <typename T> static inline T parse_arg(PyObject* args, size_t n) {
            static_assert(std::is_pointer<T>::value, "arg type must be pointer or long");
            return static_cast<T>(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n)));
        }
        template <> inline long parse_arg<long>(PyObject* args, size_t n) {
            auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == -1 && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return result;
        }
        template <> inline uintptr_t parse_arg<uintptr_t>(PyObject* args, size_t n) {
            auto result = PyLong_AsVoidPtr(PyTuple_GET_ITEM(args, n));
            if(unlikely(result == reinterpret_cast<void*>(-1) && PyErr_Occurred()))
                throw std::runtime_error("expected int arg");
            return reinterpret_cast<uintptr_t>(result);
        }

        %s

        static PyObject* %s_py(PyObject* self, PyObject* args) {
            try {
                if(unlikely(!PyTuple_CheckExact(args)))
                    throw std::runtime_error("tuple args required");
                if(unlikely(PyTuple_GET_SIZE(args) != %s))
                    throw std::runtime_error("requires %s args");
                %s
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return nullptr;
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return nullptr;
            }
        }

        static PyMethodDef py_methods[] = {
            {"%s", %s_py, METH_VARARGS, ""},
            {NULL, NULL, 0, NULL}};

        static struct PyModuleDef py_module =
            {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods};

        PyMODINIT_FUNC PyInit_%s(void) {
            const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
            if(!str_addr) {
                PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
                return nullptr;
            }
            std::istringstream iss(str_addr);
            uintptr_t addr = 0;
            iss >> addr;
            _torchinductor_pyobject_tensor_data_ptr =
                reinterpret_cast<decltype(_torchinductor_pyobject_tensor_data_ptr)>(addr);
            return PyModule_Create(&py_module);
        }
        rB   r   r  c                 C  s   t tjjjjtjd< | d| j }zt	j
| W S  tk
rF   Y nX tj||}|d k	sbttj|}|t	j
|< |j| |S )NZ'_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTRrF   )rB   rL   r  Z_dynamor  Z'_torchinductor_pyobject_tensor_data_ptrrT   r  entry_functionrP   modulesKeyErrorr  utilspec_from_file_locationrc   module_from_specloaderexec_module)r  rU   r   module_namer0  r1  r9   r9   r=   r  ,
  s    


z.CppPythonBindingsCodeCache._load_library_innerr   Nr9   r  r>   r  r   )argtypesru  rN   num_outputsr@   c           
        s   d dd t|D } j j jr0 j| nd jt|t| j|  j j j jf
 } j|| |||dd fdd}	|	S )	a5  
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
            source_code: C++ source code containing a ENTRY_FUNCTION() function

        Returns:
            A python version of ENTRY_FUNCTION()
        , c                 s  s,   | ]$\}}d | dd d| dV  qdS )z
parse_arg<zconst rG   z>(args, )N)rO   )r   nZargtyper9   r9   r=   r   Q
  s   zBCppPythonBindingsCodeCache.load_pybinding_async.<locals>.<genexpr>rG   )r  rs  Nc                     s(   d kr t tstt jS r8   )r   r   rc   r  r  r9   r  Z
get_resultr  r9   r=   r  f
  s    z?CppPythonBindingsCodeCache.load_pybinding_async.<locals>.future)rV   rP  suffix_templater  extra_parse_argr  call_entry_functionr  )
r  r   ru  rN   r  r  rs  Z	parseargssuffixr  r9   r  r=   load_pybinding_async=
  s0    
   z/CppPythonBindingsCodeCache.load_pybinding_asyncr?   c                 O  s   | j || S r8   )r
  r  r;   r<   r9   r9   r=   load_pybindingo
  s    z)CppPythonBindingsCodeCache.load_pybinding)Fr   Nr9   )r   r   r   rm   r   r   r  r  r  r  r  r  textwrapdedentr  r  r  r
  r  r9   r9   r9   r=   r  	  s*   

O    1r  c                   @  s@   e Zd ZU i Zded< eejZdddZdZ	dZ
edZdS )	CppWrapperCodeCacher  rm   Tr  Zinductor_entry_cppzreturn inductor_entry_cpp(%s);a  
        #include <torch/csrc/inductor/aoti_torch/c/shim.h>

        static inline std::vector<AtenTensorHandle> unpack_tensor_handle_list(PyObject* pyvec) {
            std::vector<AtenTensorHandle> result;
            size_t result_len = PyList_GET_SIZE(pyvec);
            result.reserve(result_len);
            for (size_t i = 0; i < result_len; i++) {
                // AtenTensorHandle is essentially a pointer
                void* elem = PyCapsule_GetPointer(PyList_GET_ITEM(pyvec, i), NULL);
                result.push_back(reinterpret_cast<AtenTensorHandle>(elem));
            }
            return result;
        }

        static inline PyObject* pack_tensor_handle_list(const std::vector<AtenTensorHandle>& cppvec) {
            size_t result_len = cppvec.size();
            PyObject* result = PyList_New(static_cast<Py_ssize_t>(result_len));
            for (size_t i = 0; i < result_len; i++) {
                PyObject *elem =
                    cppvec[i] == nullptr
                        ? Py_None
                        // Store AtenTensorHandle as PyCapsulate
                        : PyCapsule_New(reinterpret_cast<void*>(cppvec[i]), NULL, NULL);
                PyList_SET_ITEM(result, i, elem);
            }
            return result;
        }

        template <> inline std::vector<AtenTensorHandle> parse_arg<std::vector<AtenTensorHandle>>(PyObject* args, size_t n) {
            return unpack_tensor_handle_list(PyTuple_GET_ITEM(args, n));
        }

        PyObject* inductor_entry_cpp(std::vector<AtenTensorHandle>&& input_handles) {
            // For outputs, we only allocate a vector to hold returned tensor handles,
            // not allocating the actual output tensor storage here
            std::vector<AtenTensorHandle> output_handles(%s);
            try {
                inductor_entry_impl(input_handles.data(), output_handles.data());
                return pack_tensor_handle_list(output_handles);
            } catch(std::exception const& e) {
                PyErr_SetString(PyExc_RuntimeError, e.what());
                return {};
            } catch(...) {
                PyErr_SetString(PyExc_RuntimeError, "unhandled error");
                return {};
            }
        }
        N)r   r   r   rm   r   r   r  r  r  r  r  r  r  r  r9   r9   r9   r=   r  t
  s   

r  new_cmdold_cmdc                   s`   fdd D } fddD }|s,|r\t d  t d t d| t d| tdd S )	Nc                   s   g | ]}| kr|qS r9   r9   r7  )r  r9   r=   r9  
  s      z6_temp_validate_new_and_old_command.<locals>.<listcomp>c                   s   g | ]}| kr|qS r9   r9   )r   y)r  r9   r=   r9  
  s      z!!! new_cmd: z!!! old_cmd: z!!! new_diff: z!!! old_diff: z'Error in new and old command different.)printrd   )r  r  Znew_diffZold_diffr9   r  r=   "_temp_validate_new_and_old_command
  s    



r  )rY  rN   r/  mmap_weightsrq  c                 C  s   t  }|j}tj o|}tj|d}tj|d}	dg}
|rPtj|d}	t	 }t
||	| ||d||||
d
d}dd	lm}m} ||| |||||
d
}|d|||d}| d}t|| |  d S )Nzdummy_input.cppzdummy_output.soz-D TEST_EXTRA_FLAGSzdummy_output.oF)
rp  rC  rY  r  rN   rZ  r/  rq  rr  rs  r8  r   r  )r  rY  rN   r/  rq  rr  rs  dummy_output)rC   r  r  r  )r  r  rC   rL   rN   Zis_availablerT   rU   rV   r.  rv  rV  r  r  r  r  r  cleanup)rY  rN   r/  r  rq  temp_dirZtest_dir_pathZ	test_cudar  r  rs  Z
picked_isar  r  r  Zdummy_build_optionr  r  r9   r9   r=   _do_validate_cpp_commands
  sT    

r  c            
      C  s   ddg} ddg}ddg}ddg}ddg}| D ]d}|D ]Z}|D ]P}|D ]F}|D ]<}	t d| d| d| d| d|	 
 t|||||	d qLqDq<q4q,d S )	NTFz	!!! cuda:z, use_mmap_weights:z, compile_only:z, include_pytorch:u   ， use_absolute_path:)rY  rN   r  r/  rq  )r  r  )
rN   rr  r/  rY  rq  r   r  zmr  r9   r9   r=   validate_new_cpp_commands
  s(     r  c                   @  s   e Zd ZU i Zded< eejZe	dZ
edd Zeeddd Zeedd	d
 Zedd Zeeddd Zeeddd ZeddddddZedd ZdS )HalideCodeCachez0Dict[str, Callable[[], Union[ModuleType, CDLL]]]rm   ah  
        #include "{halidebuffer_h}"
        #include "{headerfile}"
        #include <stdexcept>
        #include <cmath>
        void kernel({argdefs}) {{
            {buffers}
            int err = halide_kernel({buffer_names});
            if(err != 0) {{
                throw std::runtime_error("halide_kernel failed");
            }}
        }}
        c                 C  s   g }g }t |D ]j\}}|jr`|d|  |d|d  d|  d|j d|j d	 qd|jksnt||j q| jj| 	d|d
d	d
 |D d
| d
|d}|S )NZhl_buf_z    Halide::Runtime::Buffer r   (r  z);*zHalideBuffer.hc                 s  s"   | ]}|   d |j V  qdS )r8  N)bindings_typerC   r  r9   r9   r=   r   6  s     z0HalideCodeCache._codegen_glue.<locals>.<genexpr>r  )Zhalidebuffer_h
headerfileZargdefsbuffersbuffer_names)rP  r  r  Zhalide_typerC   ctyperc   glue_templateformatfind_headerrV   lstrip)r  r   r"  r#  r$  rT  r  Z	glue_coder9   r9   r=   _codegen_glue&  s$    (zHalideCodeCache._codegen_glueNc                 C  s(   t d| j|   tddgdS )Nr  IOr`   )r   rV   r&  cpu_cache_sizerv  ri   r  r9   r9   r=   config_hash<  s    zHalideCodeCache.config_hashc                  C  s|   zt d } W n tk
r&   Y dS X td| }|rJt|dd S td| }|rpt|dd d S tdd S )Nr#  i   zcache size\s*: (\d+) KBr   r  zcache size\s*: (\d+) MBz4failed to find 'cache size: ... KB' in /proc/cpuinfo)rv   r(  r  rw  r  r  r*  rd   )Zcpuinfor  r9   r9   r=   r-  I  s    zHalideCodeCache.cpu_cache_sizec           	   
   C  s   zt jjdj^}}t|D ]}|dr zt	dtj
||g}W n tjk
rd   Y q Y nX td|d}|r tj
tj
|d| }tj
|r tj
|  W S q W n, tk
r } zt||W 5 d }~X Y nX t|d S )Nhalider|  Zlddz(/.*)/libHalide.sor`   r   )r  	machinery
PathFinderr%  r+  rT   rg  r}  r  r  rU   rV   r  rw  r  r   abspathr*  r   ri  rd   )	r	  errmsgr  rI   fileoutr  rU   r   r9   r9   r=   _search_for_fileX  s(    


z HalideCodeCache._search_for_filec                 C  sV   d|    d}dtjkr>tjtjd |}tj|r>|S d| d}t||S )NZlibautoschedule_r|  
HALIDE_LIBCan't find z3, set env HALIDE_LIB to the directory containing it)r   rT   r  rU   rV   r   r  r6  )rC   ZsofilerU   r3  r9   r9   r=   find_libautoscheduleo  s    

z$HalideCodeCache.find_libautoschedulec                 C  s   dt jkr.t jt jd | }t j|r.|S dt jkrjt jt jt jd d|  }t j|rj|S d|  d}td|  |S )NZHALIDE_INCLUDEr7  z../include/r8  z7, set env HALIDE_INCLUDE to the directory containing it)rT   r  rU   rV   r   r2  r  r6  )rC   rU   r3  r9   r9   r=   r(  |  s    


zHalideCodeCache.find_headerr0   rB   )r   ru  c                   sZ  t tt|t|  |fddd }tj|dd d t|d }t|d }t|d }t|d	 }t|d
 }	tj	| }
g }|
rt
|| |ttjtj|ddd| ddddd| |jf|  | jdd |jD | |j||f|
r|jnd d |
rH|tt| tt|	|}|rB||jn|   fdd}|S )Nr  r/  ro  TrJ   zgenerate_kernel.pyzhalide_kernel.azhalide_kernel.hdoner  -gr  z-oz-fZhalide_kernelz-ez)static_library,h,schedule,pytorch_wrapperz-pc                 S  s   g | ]}|  qS r9   )r!  r  r9   r9   r=   r9    s     z9HalideCodeCache.generate_halide_async.<locals>.<listcomp>)rs  r  c                     s   r
    S r8   r9   r9   Zbindings_futureZwait_for_compiler9   r=   rw     s    z3HalideCodeCache.generate_halide_async.<locals>.load)r	   r   r   r   r.  rT   rW   rB   rU   r   r   r  r   r   r  r  rP   r   r9  Z	schedulerr;   r
  r   r*  touch_worker_task_halider  )r  r   ru  r  dirpathZgenfileZlibfiler"  ZdonefilelockfileZneed_compilejobstaskrw   r9   r<  r=   generate_halide_async  sp    	

z%HalideCodeCache.generate_halide_asyncc                 O  s   | j || S r8   )rC  r  r9   r9   r=   generate_halide  s    zHalideCodeCache.generate_halide)N)r   r   r   rm   r   r   r  r  r  r  r&  r  r*  r   r   r.  r-  r6  r9  r(  rC  rD  r9   r9   r9   r=   r    s2   



Ar  c              	   C  s6   ddl m} || t |D ]
}|  qW 5 Q R X d S )Nr   r  )r  r  r  )r@  rA  r  Zjobr9   r9   r=   r>    s    r>  c                 C  s   t | d  d S )Nr  )rv   closer  r9   r9   r=   r=    s    r=  c                   @  s   e Zd ZU e Zded< e Zded< eejZ	e
dddddd	d
Ze
dddddddddZe
dddddddddZe
edddddddZdS )ry  zDict[str, ModuleType]rm   z Dict[str, List[Tuple[Any, ...]]]linemapsrG   rB   r   )ru  r   r@   c                 C  s   t |d|dS NrH   r  r   )r  ru  r   r9   r9   r=   r     s    zPyCodeCache.writeNr  r   r   )ru  r   linemapattrsr@   c                 C  s"   t |d|d\}}| ||||S rG  )r   rz  )r  ru  r   rH  rI  r   rU   r9   r9   r=   rw     s    zPyCodeCache.load)r   rU   rH  rI  r@   c                 C  s   |d krg }|| j kr~t||}| j || tt| | j|< |d k	rf| D ]\}}t||| qP|s~|s~t	t
|||_| j | S r8   )rm   r    r   r  r  rF  r   setattrr   r   r!   Z_reload_in_subproc)r  r   rU   rH  rI  modr  r  r9   r9   r=   rz    s     

  zPyCodeCache.load_by_key_pathr  zOptional[List[Dict[str, Any]]])rU   linenor@   c                 C  s^   || j krd S | j | \}}t||}|dkr2d S ||d  }|sFd S ddddd}||S )Nr   r   rB   zList[Dict[str, Any]])stack_tracer@   c                 S  s"   d}t || }dd t|D S )Nz"File "(.+)", line (\d+), in (.+)\nc                 S  s"   g | ]\}}}|t ||d qS ))r  r)  rC   )r  )r   r   lr  r9   r9   r=   r9  (  s   zPPyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace.<locals>.<listcomp>)rw  findallreversed)rM  regexmatchesr9   r9   r=   parse_stack_trace#  s
    z<PyCodeCache.stack_frames_for_code.<locals>.parse_stack_trace)rF  r   )r  rU   rL  r  r  ra  entryrS  r9   r9   r=   stack_frames_for_code  s    


z!PyCodeCache.stack_frames_for_code)rG   )rG   NN)NN)r   r   r   r  rm   r   rF  r   r  r  r  r   rw   rz  r   r   rU  r9   r9   r9   r=   ry    s"   

   
  ry  c                   @  s"   e Zd ZeddddddZdS )TritonCodeCacherB   r   )kernel_nameru  r@   c                 C  s   t t||S r8   )r   ry  rw   )r  rW  ru  r9   r9   r=   rw   1  s    zTritonCodeCache.loadN)r   r   r   r  rw   r9   r9   r9   r=   rV  0  s   rV  r  c                   C  s   t tjjrtjjS t r2tjt	 ddS t t
drNt
ddS t t
dr|tjtjt
dddS dS )Nr  ZnvccZCUDACXXrG   rI  zbin/nvcc)r   Z
nvcc_existr   rN   Zcuda_cxxr<  rT   rU   rV   r2   r  realpathr9   r9   r9   r=   _cuda_compiler6  s    rY  c               	   C  s|   t  r ddlm}  | d}nt jj}tj	tj
|dtj	tj
|dtj	tj
|dtj	tj
|dgS )Nr   r:  zcutlass-3-headersrK  ztools/library/includeztools/library/srcztools/util/include)r   r<  r?  r;  Zget_dir_pathrN   Zcutlass_dirrT   rU   rX  rV   )r;  Zcutlass_pathr9   r9   r=   _cutlass_include_pathsB  s    rZ  c                  C  s   t   ddlm}  | jddtdg }g }t rxt| |D ] }|d| dd| g q@|	d	 |	d
 nt
d|S )Nr   r[  T)rN   r]  r_  z-Xlinkerz-rpath=z-lcudaz-lcudartzMUnsupported env, failed to find cuda libs! Currently only Linux is supported.)rJ  rb  r\  rd  rM  re  r%   rX  extendr  NotImplementedError)r\  rS  Zextra_ldflagsrU   r9   r9   r=   _cuda_lib_optionsR  s     

r]  c                   C  s   ddddgS )Nr2  z-fno-strict-aliasingz-fvisibility=hiddenz-Wconversionr9   r9   r9   r9   r=   _nvcc_host_compiler_optionsi  s
    r^  c               	   C  s   t  } | dkrd} d|  d|  g}tjjr@|d|  g7 }dddd	|  d
d| dtjjdddg}t r|dt	j
t g tjjr|dddg tjjr|dddddg tjjr|ddg |S )NZ90Z90aZsm_Zcompute_Zlto_z-t=0z"-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1z-wz-gencode=arch=compute_z,code=[,]r7  z--expt-relaxed-constexprz-DNDEBUGz-ccbinz	-lineinfor;  z-DCUTLASS_DEBUG_TRACE_LEVEL=1z--keepz,--ptxas-options=--warn-on-local-memory-usagez --ptxas-options=--warn-on-spillsz--resource-usagez--source-in-ptxz--use_fast_mathz -DCUTLASS_USE_TANH_FOR_SIGMOID=1)r   Zget_cuda_archr   rN   Zenable_cuda_ltorV   Zcompile_opt_levelr<  r[  rT   rU   r=  r2   r  Zenable_debug_infoZenable_ptxas_infoZuse_fast_math)archr   optionsr9   r9   r=   _nvcc_compiler_optionsr  sF    
	rc  Optional[List[str]])	src_filesdst_filedst_file_ext
extra_argsr@   c                 C  s  |d krg }t  }t }t }t }|| dd |D  dd |D  | }d| }	d}
|dkrt  dd| d| d|	 }
nr|dkr|d	 t  dd| d
| d|	 }
n<|dkrt  dd| d
| d|	 }
ntd| dt	d|
 |
S )Nc                 S  s(   g | ] }d |krd| nd| qS )r  z-Xcompiler z-Xcompiler=r9   )r   optr9   r9   r=   r9    s   z(cuda_compile_command.<locals>.<listcomp>c                 S  s   g | ]}d | qS rt  r9   )r   rU   r9   r9   r=   r9    s     r8  rG   r  r  r  z-sharedz -o exezUnsupported output file suffix !zCUDA command: %s)
rZ  r]  r^  rc  rV   rY  r  r\  r  rY  )re  rf  rg  rh  rc  Zcuda_lib_optionsZnvcc_host_compiler_optionsZnvcc_compiler_optionsrb  Zsrc_fileresr9   r9   r=   cuda_compile_command  s<    

$
$$rm  c                   @  sN   e Zd ZdZddddZdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dS )
DLLWrapperz A wrapper for a dynamic library.rB   )lib_pathc                 C  s"   || _ d| _t|| _d| _d S )NFT)ro  is_openr   r  DLL)rr   ro  r9   r9   r=   rs     s    zDLLWrapper.__init__c                 C  s   | j r|   d| _ d S rA   )rp  _dlcloserq   r9   r9   r=   rE    s    zDLLWrapper.closec                 C  sj   d }t  r6td }t|ds$td}t|dr>|j}ntd|d k	r\tg|_|| jj n
t	
d d S )Ndlclosezlibc.soz&Unsupported env, failed to do dlclose!zKdll unloading function was not found, library may not be unloaded properly!)r%   r   r   rs  r\  r   r   rq  Z_handler  r  )rr   Z	f_dlcloseZsymsr9   r9   r=   rr    s    

zDLLWrapper._dlclosec                   s2   | j std| j t| j|  fdd}|S )NzCannot use closed DLL library: c                    s     |  }|rt d j d S )NzError in function: )rd   r   )r;   errmethodr9   r=   _wrapped_func  s    z-DLLWrapper.__getattr__.<locals>._wrapped_func)rp  rd   ro  r  rq  )rr   rC   rw  r9   ru  r=   __getattr__  s
    zDLLWrapper.__getattr__c                 C  s   | S r8   r9   rq   r9   r9   r=   	__enter__  s    zDLLWrapper.__enter__c                 G  s   |    d S r8   rE  )rr   r;   r9   r9   r=   __exit__  s    zDLLWrapper.__exit__c                 C  s   |    d S r8   rz  rq   r9   r9   r=   __del__  s    zDLLWrapper.__del__N)r   r   r   r   rs   rE  rr  rx  ry  r{  r|  r9   r9   r9   r=   rn    s   	rn  c                   @  sx   e Zd ZU ejG dd dZe Zded< e	ej
ZdZedddd	ZeddddddZeddddZd
S )CUDACodeCachec                   @  s   e Zd ZU ded< ded< dS )zCUDACodeCache.CacheEntryrB   r  r  N)r   r   r   r   r9   r9   r9   r=   
CacheEntry  s   
r~  zDict[str, CacheEntry]rm   rE   r   r?   c                 C  s.   t tdgd|}t|| j|d\}}||fS )z
        Writes source code into a file with dst_file_ext as the file extension.
        Returns the hash key of source code, and the path to the file.
        Zdummy_inputr  r  )r   rm  r   _SOURCE_CODE_SUFFIX)r  ru  rg  Zcuda_commandr   r  r9   r9   r=   r   
  s      
zCUDACodeCache.writeNrd  r   )rh  r@   c                 C  sN  |  ||\}}|| jkr<ddlm} t }|tj||d td}| |dt	| j
  | }	tj|	st|g|	||}
t }td|
 |
d}ztj|tjtjd W n4 tjk
r } zt||j|W 5 d}~X Y nX t }d	||  d
|
 }t| ntd| t||	| j|< W 5 Q R X | j| j||fS )z
        Compiles CUDA source_code into a file with dst_file_ext extension.
        Returns a tuple of dst_file_path, hash_key, source_code_path
        r   r  r  r  NzCUDA Compilation: %sr8  r  zCUDA Compilation took z seconds. Compile command: z8CUDA Compilation skipped: %s since output already exists)r   rm   r  r  r   rT   rU   rV   r  r  r  r   rm  r
   r  rY  rV  r  r  r  r  ry  r   ZCUDACompileErrorrC  r  r}  r~  r  )r  ru  rg  rh  r   r  r  r   r  r  rw  r  Z	cmd_partsr  Zend_timeZlog_duration_msgr9   r9   r=   r    sD       
  
"zCUDACodeCache.compilezTuple[DLLWrapper, str, str]c                 C  s<   |dkrt d| d| | ||\}}}t|||fS )z
        Compiles source code and loads the generated .so file.
        Returns a tuple of DLLWrapper, hash_key, source_code_path
        r  zCOnly support loading a .so file for now. Requested file extension: z. Source code: )rd   r  rn  )r  ru  rg  Zdst_file_pathr   Zsource_code_pathr9   r9   r=   rw   B  s     
zCUDACodeCache.load)N)r   r   r   r   	dataclassr~  r  rm   r   r   r  r  r  r  r   r  rw   r9   r9   r9   r=   r}    s   

 (r}  c                   @  s   e Zd Zdd ZdS )CodeCacheFuturec                 C  s   t d S r8   )r\  rq   r9   r9   r=   r  U  s    zCodeCacheFuture.resultN)r   r   r   r  r9   r9   r9   r=   r  T  s   r  c                   @  s6   e Zd ZU ded< ddddddZdd	d
dZdS )TritonFuturer   r  r   zOptional[Future[Any]]rp   )r  r  r@   c                 C  s   || _ || _d S r8   )r  r  )rr   r  r  r9   r9   r=   rs   \  s    zTritonFuture.__init__r?   c                 C  s6   | j d k	r0| j  }|d ks td | _ | j  | jS r8   )r  r  rc   r  Z
precompile)rr   r  r9   r9   r=   r  e  s    


zTritonFuture.resultN)r   r   r   r   rs   r  r9   r9   r9   r=   r  Y  s   
	r  c                   @  s   e Zd Zdd Zdd ZdS )LambdaFuturec                 C  s
   || _ d S r8   	result_fn)rr   r  r9   r9   r=   rs   p  s    zLambdaFuture.__init__c                 C  s   |   S r8   r  rq   r9   r9   r=   r  s  s    zLambdaFuture.resultN)r   r   r   rs   r  r9   r9   r9   r=   r  o  s   r  )rG   )rG   )rG   r   )rG   r   rG   )F)r9   )T)TF)T)N)
__future__r   r   r  r   r   re   r  r   rg   loggingrT   rh  r"  r  rw  rx  r  r  r  rP   rM  r  r  r   r   bisectr   r   r  r   r   r   r   pathlibr	   r
   r   typesr   typingr   r   r   r   r   r   r   r   r   r   r   r   rL   Ztorch._dynamo.utilsr   r   Ztorch._inductorr   r   r   Ztorch._inductor.codegen.cudar   Z%torch._inductor.runtime.compile_tasksr   r    r!   Z%torch._inductor.runtime.runtime_utilsr"   Ztorch._inductor.utilsr#   r$   r%   Ztorch._loggingr&   Ztorch._subclasses.fake_tensorr'   r(   r)   Z%torch.fx.experimental.symbolic_shapesr*   r+   r,   concurrent.futuresr-   Ztorch._inductor.graphr.   Ztorch._inductor.irr/   Ztorch._inductor.runtime.hintsr0   rU   r2  r>  Z_HEREr=  r  rV   ru  r  r<  Z	triton.fbr2   Ztriton.fb.buildr3   Ztorch._inductor.fb.utilsr4   r5   r6   r7   rZ  ZgetArtifactLoggerr   r  r  	getLoggerr  rY   r[   r\   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   Picklerr   r*  r4  r   rA  rB  rC  ri  r   rD  r\  r]  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r!  r-  r(  r,  r.  r0  r4  r5  r6  r:  r;  r=  r>  r@  rA  rD  rH  rJ  rO  rX  ro  rv  r{  r~  r  r  rv  r  r  r  r   r  r  r  r  r  r  r  r  r  r>  r=  ry  rV  rY  rZ  r]  r^  rc  rm  rn  r}  r  r  r  r9   r9   r9   r=   <module>   s   8



F[  

I
	=  c:
m#	
 4(D  72"b	 #?: CN	. %>T