U
    zhb                    @   s
  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dlm
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlmZmZ d dlm  m  m Z! d dl"Z#d dl$Z#d dl%m&  m'Z( d dl)m*Z* d dl+m,Z, d d	l-m.Z. d d
l/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6m7Z7m8Z8 d dl9m:Z: d dl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB d dlCmDZDmEZEmFZF d dlGmHZH ddlImJZJmKZK ddlLmMZM ddlKmNZNmOZOmPZPmQZQ ddlRmSZS ddlTmUZU ddlVmWZW ddl&mXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZf ddlgmhZhmiZi er\ddljmkZk elemZnej
ejoddZoe#jhjpZpdd Zqdd  Zrd!d" Zsd#d$ Ztd%d& Zud'd d(dgZvd)d d'd(dgZwd*d+ Zxeey eey d,d-d.Zzdd0d1Z{d2d3 Z|d4d5 Z}d6d7 Z~d8d9 ZG d:d; d;ZejG d<d= d=eZd>d? ZG d@dA dAeZejG dBdC dCeZerdDerdEerdFerdGerdHerdIdJZddKdLZejG dMdN dNeZdOdP ZG dQdR dReZejG dSdT dTeZejG dUdV dVeZdWdX ZdYdZ Zdd\d]Zej
ed/d^Zd_d` ZejG dadb dbeZejG dcdd ddeZejG dedf dfeZG dgdh dheZejG didj djeZejG dkdl dleZejG dmdn dneZG dodp dpeZG dqdr dreZejG dsdt dteZejG dudv dveZdwdx Zdydz ZejG d{d| d|eZG d}d~ d~eZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZeeyeeeeeeyeeef  f ZG dd dZG dd deZG dd deZG dd deZG dd deZejG dd deZG dd deZG dd deZdd ZejG dd deZejG dd deZG dd deZG dd deZG dd deZedddZG dd deZG dd deZG dd deZG dd deZG dd deÃZG dd deZG ddÄ deZG ddń deZG ddǄ deZG ddɄ deZG dd˄ deZejG dd̈́ d̓ZepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjepjjhZG ddτ deZejG ddф deރZejG ddӄ deZG ddՄ deZddddeey eey eey eyeeeey  dל	ddلZddddڜdd܄ZG ddބ deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZejG dd deZG ddք deZG dd deZejG dd deZdd ZejG dd deZejG d d deZG dd deރZejG dd deZG dd de#jjZG dd	 d	ZG d
d dZG dd deރZG dd deZdd Z dS (      N)nullcontext)partial)AnyCallableClassVarDictIterableListOptionalSequenceSetTupleTYPE_CHECKINGUnion)patch)ExprInteger)get_interface_for_device)identity)GraphModuleSerializer)can_auto_functionalize)metrics)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_for
StrideType)get_schema_info)CallMethodKeycompute_unbacked_bindingsDivideByKeyfree_unbacked_symbolsrebind_unbackedresolve_unbacked_bindingsSymTypes)CleanDivFloorDivModularIndexing)SymT   )configdependencies)index_prevent_reordering)extract_free_unbacked_symbols#extract_input_node_reduction_rangesextract_read_writesvar_builder)OpCounterCSE)ReductionHint)do_bench)argsortcache_on_selfceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningget_kernel_metadata
is_dynamicis_gpupad_listlike	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)opsV)GraphLoweringz  prefixc                    s    fdd  |  d S )Nc              	      s   | d kr
n|t | ttfr,| D ]} | qnZt | trN|  D ]} | q>n8t | tjjjt	t
ttjjjttfstdt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])
isinstancelisttupledictvaluestorch	_inductorZir
ExpandViewDynamicScalarAssertScalar	TensorBoxsympylogicboolalgBooleanr   EffectfulKernelAssertionErrortype)nodesnode_check_tensorbox D/var/www/html/venv/lib/python3.8/site-packages/torch/_inductor/ir.pyr]      s*    
z%validate_ir.<locals>._check_tensorboxr^   )Znode_or_nodesr^   r\   r_   validate_ir   s    r`   c                    s   t  tst fdd}|S )Nc                     s   t t | |S N)getattrrC   argskwargsnamer^   r_   fn   s    zops_wrapper.<locals>.fn)rH   strrX   )rg   rh   r^   rf   r_   ops_wrapper   s    rj   c                    s&   t t| tt|   fdd}|S )Nc                    s0   t  t kst fddtt  D S )Nc                    s   g | ]} |  qS r^   r^   .0i)index	inv_orderr^   r_   
<listcomp>   s     z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenrX   rangern   ro   rt   r_   reindex   s    z inverse_reorder.<locals>.reindex)rK   ziprs   rr   orderrv   r^   ru   r_   inverse_reorder   s    rz   c                    s    fdd}|S )Nc                    s0   t  t kst fddtt  D S )Nc                    s   g | ]} |  qS r^   r^   rk   )rn   ry   r^   r_   rp      s     z1same_reorder.<locals>.reindex.<locals>.<listcomp>rq   rt   ry   rt   r_   rv      s    zsame_reorder.<locals>.reindexr^   rx   r^   r{   r_   same_reorder   s    r|   c                    s    fdd}|S )Nc                    s    | S ra   r^   rt   reindex1reindex2r^   r_   rv      s    z fuse_reindexing.<locals>.reindexr^   )r~   r   rv   r^   r}   r_   fuse_reindexing   s    r            c                    s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S   s   i | ]\}}||qS r^   r^   rl   idxposr^   r^   r_   
<dictcomp>   s      z+stride_order2fill_order.<locals>.<dictcomp>c                    s   g | ]} | qS r^   r^   rk   lookupr^   r_   rp      s     z+stride_order2fill_order.<locals>.<listcomp>)	enumeraters   rr   )ry   
fill_orderr^   r   r_   stride_order2fill_order   s    r   )seqreturnc                 C   s<   t | }dd tt| D }t|D ]\}}|||< q&|S )z)
    Convert strides to stride order
    c                 S   s   g | ]}d qS r   r^   rl   _r^   r^   r_   rp      s     z$get_stride_order.<locals>.<listcomp>)r4   rs   rr   r   )r   Z
sorted_idxoutrm   elemr^   r^   r_   get_stride_order   s
    
r   Tc                    s   | d krd S |st jjj nt  fdd|  D }t| rX fdd|  jD }n
t	
|}|  }|  }t|}t|}tj||||d }|S )Nc                    s   g | ]} |qS r^   r^   rl   sZshape_fnr^   r_   rp      s     z%ir_node_to_tensor.<locals>.<listcomp>c                    s   g | ]} |qS r^   r^   r   r   r^   r_   rp      s     )sizestridedtypedevice)rD   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   FlexibleLayoutcontiguous_strides	get_dtype
get_devicer8   rM   Zempty_stridedZzero_)xguard_shaper   r   r   r   tr^   r   r_   ir_node_to_tensor   s(    
   
r   c                 C   s   t | tr| sd gS | S ra   )rH   rI   valuer^   r^   r_   may_convert_to_optional   s    r   c                 C   s.   t | dd rt|  S t| tjr*| jS d S )Nr   )rb   get_device_typer   rH   rM   r   rY   r   r^   r^   r_   r     s
    r   c                 C   s   t t| S ra   )r<   r   r   r^   r^   r_   	is_triton
  s    r   c                 C   s   t | dkS Ncpu)r   r   r^   r^   r_   is_cpu  s    r   c                   @   s  e Zd ZU e Zeee  ed< e	e
jeejj dddZdd Zdd Zd	d
 Zdd Zdd Zedd Zdd Zdd Zdd Zdd Zdd Zdd Zd,ddZeg ejf ed < ej ed!< eg e!f ed"< eg ef ed#< eg ef ed$< eg ef ed%< eg e"f ed&< eg eegef f ed'< eg eegef f ed(< ee#gdf ed)< eg df ed*< eg ee$j% f ed+< dS )-IRNode_current_origins)originsc                 c   s(   t j}|| B t _z
d V  W 5 |t _X d S ra   )r   r   )r   oldr^   r^   r_   current_origins  s
    

zIRNode.current_originsc                 C   s$   t | j| _tjrt nd | _d S ra   )setr   r   r*   debug_ir_traceback	tracebackformat_stackselfr^   r^   r_   __post_init__  s    zIRNode.__post_init__c                 C   s   | j S ra   )r   r   r^   r^   r_   get_traceback#  s    zIRNode.get_tracebackc                 C   s6   dt | dd }t|dkr0|d d  d}|gS )Nzorigins=r    @   =   z...)rb   rr   )r   r   r^   r^   r_   common_repr&  s    zIRNode.common_reprc                 C   s6   ||    }tdtt|}t| j d| dS )Nz,
z(
z
))r   indentjoinmapri   rY   __name__r   linesr^   r^   r_   
str_helper-  s    zIRNode.str_helperc                 C   s   ||   kS ra   )get_read_namesr   rg   r^   r^   r_   
is_user_of2  s    zIRNode.is_user_ofc                 C   s   dd |   D S )Nc                 S   s   h | ]
}|j qS r^   rf   )rl   depr^   r^   r_   	<setcomp>7  s     z(IRNode.get_read_names.<locals>.<setcomp>)	get_readsr   r^   r^   r_   r   5  s    zIRNode.get_read_namesc                 C   s   | j S ra   r   r   r^   r^   r_   r   9  s    zIRNode.get_dtypec                 C   s   t dt|  dd S )Nz#get_layout() is not implemented by !NotImplementedErrorrY   r   r^   r^   r_   r   <  s    zIRNode.get_layoutc                 C   s   t dt|  dd S )Nz!get_size() is not implemented by r   r   r   r^   r^   r_   r   ?  s    zIRNode.get_sizec                 C   s   t |  S ra   )rA   r   r   r^   r^   r_   	get_numelB  s    zIRNode.get_numelc                 C   s   t jjt|  dS Nr   rD   r   r   is_expr_static_and_truerS   Eqr   r   r^   r^   r_   is_zero_elementsE  s    zIRNode.is_zero_elementsc                 C   s   t dt|  dS )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on Nr   r   r^   r^   r_   realizeH  s    zIRNode.realizeNc                 C   s   t dt|  d S )Nzcodegen_reference NYI on r   r   writerr^   r^   r_   codegen_referenceZ  s    zIRNode.codegen_referencer   r   get_namer   
get_strideget_storage_numelhas_exceeded_max_readsmake_loadermake_indexer
mark_reuserealize_hintget_unbacked_symbol_uses)N)&r   
__module____qualname__r   r   r   r   r   __annotations__staticmethod
contextlibcontextmanagerrM   fxNoder   r   r   r   r   r   r5   r   r   r   r   r   r   r   r   r   r   r   ri   boolintrS   Symbolr^   r^   r^   r_   r     s<   



r   c                       s  e Zd ZU ejed< ejed< edef ed< e	e
 ed< eej dddZd.d
dZ fddZeZdd Zdd Zdd Zdd Zdd Zedd ZeejfddZedd Zdd Zd d! Z d"d# Z!d$d% Z"d&d' Z#d(d) Z$d*d+ Z%d,d- Z&  Z'S )/Loopsr   r   .inner_fnrangesr   c                 C   s"   t  jdd | jD |  f S )Nc                 s   s   | ]}t |V  qd S ra   r!   rl   er^   r^   r_   	<genexpr>w  s     z1Loops.get_unbacked_symbol_uses.<locals>.<genexpr>)r   unionr   inner_fn_free_unbacked_symbolsr   r^   r^   r_   r   u  s    zLoops.get_unbacked_symbol_usesr   c                    sF     d jj dt j  g fdd|D  d jg S )N'c                    s    g | ]}| d t  | qS =)rb   )rl   rg   r   r^   r_   rp     s     z!Loops.__str__.<locals>.<listcomp>origin_node=)r   r   rY   ri   r   inner_fn_strorigin_node)r   namesr^   r   r_   __str__{  s    zLoops.__str__c                    s   t    d | _d S ra   superr   r  r   	__class__r^   r_   r     s    
zLoops.__post_init__c                 C   s   | j S ra   r   r   r^   r^   r_   r     s    zLoops.get_devicec                 C   s   | j S ra   r  r   r^   r^   r_   get_origin_node  s    zLoops.get_origin_nodec                 C   s   | j S ra   r   r   r^   r^   r_   r     s    zLoops.get_sizec                 C   s   | j S ra   r   r   r^   r^   r_   get_pointwise_size  s    zLoops.get_pointwise_sizec                 C   s   dS NFr^   r   r^   r^   r_   	is_extern  s    zLoops.is_externc                 O   sJ   | dd }| dd }| ||}||_tjr:|p<t nd |_t|S )Nr  r   )popr  r*   r   r   r   rR   create)clsrd   re   r  tbrr^   r^   r_   r    s    
zLoops.createc                    s    fddt | D S )Nc                    s,   g | ]$\}}|d krt dnt |qS )r)   r   )rS   r   r@   )rl   nr   rF   r^   r_   rp     s   z Loops._index.<locals>.<listcomp>)r   )r   rG   r^   rF   r_   _index  s    
zLoops._indexc                 C   sh   t t }t|H ttdd. | j|    |j	W  5 Q R  W  5 Q R  S Q R X W 5 Q R X d S Nallow_indexingT)
r1   rD   ZMockHandlerset_ops_handlerr   objectr   r   inner_fn_argsZop_count)r   Z	opcounterr^   r^   r_   inner_fn_opcount  s      zLoops.inner_fn_opcountc                 C   s   |  | jfS ra   )r  r   r   r^   r^   r_   r    s    zLoops.inner_fn_argsc                 C   s   t jj| jf|   S ra   )rD   ZKernelFormatterHandlerZir_to_stringr   r  r   r^   r^   r_   r    s
    zLoops.inner_fn_strc                 C   s   |   tjkS ra   )r  r*   Zrealize_opcount_thresholdr   r^   r^   r_   has_large_inner_fn  s    zLoops.has_large_inner_fnc                 C   s   |  | j}t| j|S ra   )r  r   r-   r   )r   rn   r^   r^   r_   r     s    z$Loops.inner_fn_free_unbacked_symbolsc              
   C   sl   t tddT |  r>t|  |  |  jW  5 Q R  S t|  |  jW  5 Q R  S W 5 Q R X d S r  )	r   r  r   get_reduction_typer/   r   r   get_reduction_sizereadsr   r^   r^   r_   r     s    zLoops.get_readsc                 C   s   t dt|  dd S )Nz+get_reduction_size() is not implemented by r   r   r   r^   r^   r_   r    s    zLoops.get_reduction_sizec                 C   s   t dt|  dd S )Nz+get_reduction_type() is not implemented by r   r   r   r^   r^   r_   r    s    zLoops.get_reduction_typec                 C   s   t dt|  dd S )Nz+constant_to_device() is not implemented by r   r   r   r   r^   r^   r_   constant_to_device  s    zLoops.constant_to_device)r   )(r   r   r   rM   r   r   r   r   r   r	   r   r   rS   r   r   r  r   __repr__r   r  r   r  r  classmethodr  r   r(   INDEXr  r5   r  r  r  r  r   r   r  r  r"  __classcell__r^   r^   r  r_   r   n  s6   






	r   c                C   s&   |j rttd|S td|S d S )Nnanr   )is_floating_pointrC   constantfloat)r   r   r^   r^   r_   nop_loader_fn  s    r+  c                   @   s4   e Zd Zdd Zdd Zdd Zdd Zd	d
 ZdS )	Pointwisec                 C   s   |   rtt| jdS | jS )Nr   )r   r   r+  r   r   r   r^   r^   r_   r     s    zPointwise.make_loaderc                 C   s   g S ra   r^   r   r^   r^   r_   r    s    zPointwise.get_reduction_sizec                 C   s   d S ra   r^   r   r^   r^   r_   r    s    zPointwise.get_reduction_typec                 C   s   |   }t|||||S ra   )r   rC   storer   output_nameindexervarsloaderr^   r^   r_   store_output  s    zPointwise.store_outputc                 C   s,   |   }ttd||}t|| j|| jS FMove this to a given device. Requires that all reads are to constants.override_device)r   r   r  ConstantBufferr,  r   r   r   r   r2  r^   r^   r_   r"    s    zPointwise.constant_to_deviceN)r   r   r   r   r  r  r3  r"  r^   r^   r^   r_   r,    s
   r,  c                   @   sD   e Zd ZU eee gef ed< dZee	 ed< dd Z
dd ZdS )Scatteroutput_indexerNscatter_modec                 C   s4   |   }ttd||}t|| j|| j| j| jS r4  )	r   r   r  r7  r9  r   r   r:  r;  r8  r^   r^   r_   r"  	  s    zScatter.constant_to_devicec                 C   s*   |   }tj||| |||| jdS )N)mode)r   rC   r-  r:  r;  r.  r^   r^   r_   r3    s    zScatter.store_output)r   r   r   r   r	   r   r   r;  r
   ri   r"  r3  r^   r^   r^   r_   r9    s   
r9  
logical_ormaximumminimummuladdZbitwise_xor)anymaxminprodsumxor_sumc                    sP   t krt  }n:dkr, fdd}n dkr>dd }ntd |S )N   argminargmaxc                    s   | \}}|\}}dkr&t ||}nt ||}t ||}trt ||}t ||}	t |t ||	}t |t ||	} rt ||n
t ||}
t |t ||
}t |||t |||fS )NrI  )	rC   ltgteqr   ner=  logical_andwhere)abZa_valueZa_indexZb_valueZb_indexmaskequalZa_isnanZb_isnanZtiearg_break_ties_leftr   reduction_typer^   r_   
combine_fn/  s&    
z,get_reduction_combine_fn.<locals>.combine_fnwelford_combinec                 S   sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS ra   r^   )rQ  rR  Za_meanZa_m2Za_weightZb_meanZb_m2Zb_weightdeltaZ
new_weightZ	w2_over_wr^   r^   r_   rX  L  s    


zunknown reduction_type=)REDUCTION_COMBINE_FNr   )rW  r   rV  rX  r^   rU  r_   get_reduction_combine_fn*  s    

r\  c                       s  e Zd ZU ee ed< eed< ejed< e	ed< dd Z
dd Zeej d	 fd
dZdd Zdd Zdd Zdd Zdd Zdd Zdd Zed8ee dddZedd Zee	jdfejejejed e f ee ee ee	ee d!	d"d#Z!ed$d% Z"ed&d' Z#ee$e$e	e	d(d)d*Z%ed+d, Z&ed-d. Z'eejejejed e f ee ee ee ee ee$e	d/d0d1Z(eejejejed e f ee ee ee$e	d2	d3d4Z)eejejejed e f ee ee ee ee ee	d5
d6d7Z*  Z+S )9	Reductionreduction_rangesrW  	src_dtypereduction_hintc                 C   s   t j| ddS )N)r   r^  rW  )r  )r   r  r   r^   r^   r_   r  g  s     zReduction.__str__c                 C   s   |   S ra   )r  r   r^   r^   r_   r#  l  s    zReduction.__repr__r   c                    s"   t   t jdd | jD  B S )Nc                 s   s   | ]}t |V  qd S ra   r   r   r^   r^   r_   r   q  s     z5Reduction.get_unbacked_symbol_uses.<locals>.<genexpr>)r  r   r   r   r^  r   r  r^   r_   r   o  s    z"Reduction.get_unbacked_symbol_usesc                 C   s   | j S ra   )r^  r   r^   r^   r_   r  t  s    zReduction.get_reduction_sizec                 C   s   | j S ra   rW  r   r^   r^   r_   r  w  s    zReduction.get_reduction_typec              	   C   s0   t | j| j| j| ||}t ||||S ra   )rC   	reductionr   r_  rW  r   store_reduction)r   r/  r0  r1  reduction_varsr   r^   r^   r_   rc  z  s    
zReduction.store_reductionc                 C   s   t | jt | j S ra   )rr   r   r^  r   r^   r^   r_   index_length  s    zReduction.index_lengthc                 C   s$   |  | j}|  | jtj}||fS ra   )r  r   r^  r(   RINDEXr   rn   rindexr^   r^   r_   r    s    zReduction.inner_fn_argsc                 C   s*   |  | j}|  | jtj}t| j||S ra   )r  r   r^  r(   rf  r-   r   rg  r^   r^   r_   r     s    z(Reduction.inner_fn_free_unbacked_symbolsc              	   C   s<   |   }ttd||}t|| j|| j| j| j| j	t
jS r4  )r   r   r  r7  r]  r   r   r^  rW  r_  r2   DEFAULTr8  r^   r^   r_   r"    s    zReduction.constant_to_deviceN)
input_nodec	               	      s  dd }	t jj|}
t jjt|}tt| oP|dkoPtjoP|	|
oP|	|}|s`t	j
dfS tt| }|j| }t| dkr|jn|jddd      fd	d
} fdd}|dkr||
|}|dkr
t	j|fS |d k	r~t|tr~t|\}}|d k	r~|d k	r~t jjt|| }|
|kr~td||||| t	jdfS t	j|fS |
ks|d d krt	j
dfS t| ||||||t	j
}dd }||\}}|r||\}}t|dkrt	j
dfS t| | \\}}}d}d}|D ]V}t jj||}t jj||| }tdd |D }|rx|d7 }n|d7 }q,||krt	j||
|fS t	j ||
|fS d S )Nc                 S   s   t | ttjfS ra   rH   r   rS   r   r   r^   r^   r_   
_is_static  s    z(Reduction.num_splits.<locals>._is_staticrH  r)   Zxpu       i   c           	         s  d}d| }|d krdS | dkr(dS | | kr:}n| | k r d|  }|| d | }| ||  d ||   t | }t| fddd}t|  d	k rt|}q }n8t | }t|fd
dd}t| dk r|}n}| ||  d ||  S )N   rm  r   r)   i    c                    s   t |   S ra   absr   Ztmp_split_sizer^   r_   <lambda>      zFReduction.num_splits.<locals>.inner_reduction_splits.<locals>.<lambda>key   c                    s   t |   S ra   rp  r   max_elements_per_threadr^   r_   rs    rt  2   rS   divisorsrD  rq  rC  )	reduction_numel_hint
numel_hint	num_warpsnum_threads
split_sizetarget_blocksZblocks_per_outputr|  closestZmax_elements_per_devicery  Zmin_elements_per_deviceZmin_elements_per_threadZnum_smZthreads_per_smrr  r_   inner_reduction_splits  s6    

z4Reduction.num_splits.<locals>.inner_reduction_splitsc                    s  d}|d }d}d}|| d | }| | k r6}n| | k r | }|| d | }| ||  d ||   t | }	t|	 fddd}
t |
 d	k rt|
}q }n8t | }	t|	fd
dd}
t|
 dk r|
}n}| ||  d ||  S )Nro  rm  r      r)   c                    s   t |   S ra   rp  r   rr  r^   r_   rs    rt  zFReduction.num_splits.<locals>.outer_reduction_splits.<locals>.<lambda>ru     c                    s   t |   S ra   rp  r   rx  r^   r_   rs  
  rt  rz  r{  )r}  r~  r  r  Zrvals_per_threadZxvals_per_blockZxblocksr  r  r|  r  r  rr  r_   outer_reduction_splits  s4    

z4Reduction.num_splits.<locals>.outer_reduction_splitszUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %sr   c                    s   t d t|  |  |  d| d}| }dd |jD }g }d}t|jdd dD ]b t	 fd	d
|D rV|
 j  jtjjkrVtjj j }|jj}|  |jj|krVd}qV||fS )Nr   r   r   rg   layoutdatac                 S   s(   g | ] }t |tjrt |tjs|qS r^   )rH   rS   r   Numberrl   r  r^   r^   r_   rp   Q  s    zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S   s   | j S ra   rf   r   r^   r^   r_   rs  X  rt  z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>ru  c                 3   s   | ]}| j jkV  qd S ra   )rn   Zfree_symbolsr  mdr^   r_   r   Y  s     zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>T)ComputedBufferr   r   r   r   get_read_writes
range_varssortedr   allappendrn   rg   rD   r   name_to_bufferr  r   decide_layout)r  cbread_writesr  indiceschangedbufZoriginal_strider^   r  r_   get_read_indicesC  s2    	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s   s   | ]}|d kV  qdS r)   Nr^   r   r^   r^   r_   r   s  s     z'Reduction.num_splits.<locals>.<genexpr>)!rD   r   r   Zsymbolic_hintrA   r<   r   r*   Zsplit_reductionsr2   ri  r   ZWorkerZget_device_propertiesZgpu_subslice_countZmulti_processor_countZINNERrH   rR   r.   logdebugr]  rr   r+   index_vars_squeezer   r  simplify_with_rangesstride_hintskeysr  OUTER) r   	dst_dtyper_  r   r   r^  rW  reduction_numelrj  rl  r}  r~  Zshould_splitZdevice_interfaceZdevice_propertiesr  r  split
new_rangesnew_reduction_rangesZextracted_numel_hintr  r  r  r  r   rd  Z	num_outerZ	num_innerrm   stridesouterr^   r  r_   
num_splits  s    	
$$








 
 

  zReduction.num_splitsc                    sn   dd D t ||  fdd|dkrbtddt fddfd	d
S S dS )z1Convert inner_fn from a reduction to an pointwisec                 S   s   g | ]}t jj|qS r^   )rD   r   r   Zevaluate_static_shaperl   r   r^   r^   r_   rp     s    z2Reduction._unroll_reduction_fn.<locals>.<listcomp>c                    s,   t  fddtjdd D  D S )Nc                 3   s   | ]} |V  qd S ra   r^   )rl   rh  )rn   value_fnr^   r_   r     s   z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S   s   g | ]}t |qS r^   )rs   r  r^   r^   r_   rp     s     z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)	functoolsreduce	itertoolsproductrt   )rX  r^  r  rt   r_   rh     s    z*Reduction._unroll_reduction_fn.<locals>.fnrI  rJ  Nc                    s*   dd |D }| |t  |tjfS )Nc                 S   s   g | ]}t |qS r^   )rS   expandrk   r^   r^   r_   rp     s     zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)rC   
index_exprrM   int64rn   rh  )flatten_indexr   r^   r_   r    s    z0Reduction._unroll_reduction_fn.<locals>.value_fnc                    s    | d S Nr)   r^   rt   )rh   r^   r_   rs    rt  z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r\  FixedLayoutr   r   r   )r   r^  rW  r_  r^   )rX  r  rh   r   r^  r  r_   _unroll_reduction_fn  s     
zReduction._unroll_reduction_fn.)	r   r  r_  r   r   r^  rW  r`  rj  c
                    s  t jjt}
|
dkr fdd}|d|d|d|dd ks^t d fdd}tj|||t	|d	S |
dkrd
kr fdd}nfdd}t| ||S t
|
tjrt jj|
tjk rt|dkrt| | ||S | | |||
|		\}}|tjkr<|}|dkr|	d k	sTtt|	\}}|d k	snt|d k	s|t| | |||||
S |dkr| | ||||	S tt| |||S )Nr   c                    s(    t jkrt| S  jr t| S t| S ra   )rM   r   r(  r*  r   valr  r^   r_   py_cnst  s    

z!Reduction.create.<locals>.py_cnstr)   )rF  rG  rE  rB  z* not supported for zero-dimension tensors!c                    s   t   S ra   rC   r)  rt   )r  rW  rtypes_to_initsr^   r_   const_fn  s    z"Reduction.create.<locals>.const_fnr   r   r   r   r  c                    s   t d S r   r  rt   r  r^   r_   rh     s    zReduction.create.<locals>.fnc                    s   dd D } | |S )Nc                 S   s   g | ]}t d qS r   rS   r   r   r^   r^   r_   rp     s     z0Reduction.create.<locals>.fn.<locals>.<listcomp>r^   rn   reduction_index)r   r^  r^   r_   rh     s    r  )rD   r   r   simplifyrA   r  rX   r,  r  rI   rH   rS   r   r   r*   Zunroll_reductions_thresholdr  r  r2   ri  r.   !create_multilayer_existing_rangescreate_multilayerrR   r]  )r  r   r  r_  r   r   r^  rW  r`  rj  r  r  r  rh   hintr  r  r  r^   )r  r   r^  rW  r  r_   r    s    
	


   


zReduction.createc                 C   sv   | dkr0t |rtdS t|r$dS t|jS | dkr`t |rHtdS t|rTdS t|jS ddddddd|  S )	N>   rJ  rC  z-infr   >   rI  rD  infr)   r   r   r   )rF  rE  rG  rB  welford_reducerY  )r   r*  r   rM   iinforD  rC  rW  r   r^   r^   r_   default_accumulator8  s*    zReduction.default_accumulatorc                 C   s   | dkrdS t | |S )Nr  r   )r]  r  r  r^   r^   r_   default_valueR  s    zReduction.default_value)r  r~  r`  r   c                 C   sP   | dkr|S | dkr,|dkr,|t jkr,t jS | dkrL|dkrL|t jkrLt jS |S )Nr  rn        )r2   r  Z
OUTER_TINY)r  r~  r`  r^   r^   r_   _multilayer_second_step_hintX  s    z&Reduction._multilayer_second_step_hintc                    sD   t |gtjjt| d  fdd}|S )Nr   c                    sj   |\}| ^ }| |   fdd}r`t t  tjt tj}t ||S | S d S )Nc                      s    gS ra   r^   r^   )r  r2  	new_indexrv   r^   r_   body}  s    zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)rC   rK  r  rM   Zint32masked)rn   r  Zreduction_blockr  rS  
block_sizedefaultr2  	need_maskr  rv   )r  r  r_   
wrapper_fnx  s    
z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)Viewdynamic_reshape_indexerrD   r   r   r   rS   r   )r  r2  r^  r  r  r  r  r  r^   r  r_   _multilayer_wrap_loaderi  s    
z!Reduction._multilayer_wrap_loaderc                    sL   t dd D s tdt|t|t|  fdd}|S )Nc                 s   s   | ]}|d kV  qdS r  r^   r  r^   r^   r_   r     s    zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=c                    s:   | d t  }| t d  } |t|t| S ra   )rr   rJ   )Zmerged_indexZnew_reduction_indexZoriginal_idxr  r2  original_rangesrv   r^   r_   r    s    zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r  rX   r  r  rJ   )r  r2  r  original_reduction_rangesr  r  r  r  r^   r  r_   '_multilayer_wrap_loader_existing_ranges  s    

 z1Reduction._multilayer_wrap_loader_existing_ranges)r   r  r_  r  r  r  r  r  rW  r  r`  c                    s   |t jt jfkr|nt j}t|||||||	|}|  |   fdd}tj	j
t|}| |
||}||dt| ksttt|||||t|d |	||S )a
        Break a large reduction up into multiple smaller reductions
        recursively
        c                    s    | |S ra   r^   r  Zintermediate_loaderr^   r_   intermediate_fn  s    z;Reduction.create_multilayer_helper.<locals>.intermediate_fnN)rM   Zfloat16bfloat16r*  r]  r  r   r   rD   r   r   r   rA   r  rr   rX   rR   )r  r   r  r_  r  r  r  r  r  rW  r  r`  Zintermediate_dtypeZintermediater  r~  r^   r  r_   create_multilayer_helper  sH    
  z"Reduction.create_multilayer_helper)	r   r  r_  r   r   r^  rW  r  r`  c
                 C   s`   t |}
t|
|d  |}| ||}| |||
|||}| ||||||||f|g|||	S )r  r)   )rA   r&   r  r  r  )r  r   r  r_  r   r   r^  rW  r  r`  r  r  r  r  r^   r^   r_   r    s0         zReduction.create_multilayer)
r   r  r_  r   r  r  r  r  rW  r`  c                 C   sB   |  |	|}| ||||||}| ||||||||||	d|
S )r  r  )r  r  r  )r  r   r  r_  r   r  r  r  r  rW  r`  r  r  r^   r^   r_   r    s,    z+Reduction.create_multilayer_existing_ranges)N),r   r   r   r	   r   r   ri   rM   r   r2   r  r#  r   rS   r   r   r  r  rc  re  r  r   r"  r   r
   r   r  r  r$  ri  r   r   r   r  r  r  r   r  r  r  r  r  r  r&  r^   r^   r  r_   r]  _  s   

	
 	 b
'

 

  
!

=
&
r]  c                 C   s   d| krdS dS )Nwelfordr   r)   r^   ra  r^   r^   r_   num_reduction_outputs5  s    r  c                
       s   e Zd ZU eed<  fddZdd Zeej	fe
je
jeedef  ee ee eeddd	Zed
d Zee
je
jeedef  ee ee eeedddZ  ZS )WelfordReductionoutput_indexc	           
   
      sF   t  dkr d }	n fdd}	t |||	||||| || _d S )Nr)   r   c                    s   t  fddD S )Nc                 3   s   | ]}| V  qd S ra   r^   rl   rh   r   reduction_idxr^   r_   r   L  s     z<WelfordReduction.__init__.<locals>.loader.<locals>.<genexpr>)rJ   r  	inner_fnsr  r_   r2  K  s    z)WelfordReduction.__init__.<locals>.loader)rr   r  __init__r  )
r   r   r   r  r   r^  rW  r`  r  r2  r  r  r_   r  <  s    

zWelfordReduction.__init__c              	   C   s:   t | j| j| j| ||}|| j }t ||||S ra   )rC   rb  r   r_  rW  r   r  rc  )r   r/  r0  r1  rd  rL   r   r^   r^   r_   rc  Z  s    

z WelfordReduction.store_reduction.)r   r   r  r   r^  rW  r`  c              
      s4  dkst tjjt}fdd}	|dkrX|	d}
|	d}|	d}|
||fS |dkrfdd dkr d |	d|	dfS t fd	d
D S tjd |d\}}t	j
kr||dkr| |S fddtdD }|D ]}|  q |S )N>   rY  r  c                    s$    fdd}t j|tdS )Nc                    s   t  S ra   r  r   )r   r  r^   r_   r   t  s    z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr  r,  r  rI   )r  r   )r   r   r   r  r_   consts  s    z&WelfordReduction.create.<locals>.constr   r)   c                    s$    fdd}t j|tdS )Nc                    s   dd D } | |S )Nc                 S   s   g | ]}t d qS r   r  r   r^   r^   r_   rp     s     zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>r^   )r   r  )r2  r^  r^   r_   r     s    z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr  r  )r2  r   )r   r   r   r^  r2  r_   copy  s    z%WelfordReduction.create.<locals>.copyr  c                 3   s   | ]} |V  qd S ra   r^   r  )r  r^   r_   r     s     z*WelfordReduction.create.<locals>.<genexpr>)rW  r  c                    s(   g | ] }t t |qS r^   )rR   r  r  )rl   Z
output_idx)r   r   r  r   r`  r^  rW  r^   r_   rp     s   z+WelfordReduction.create.<locals>.<listcomp>r   )rX   rD   r   r   r  rA   rJ   r]  r  r2   ri  r  rs   r   )r  r   r   r  r   r^  rW  r`  r  r  Zmeanm2weightr  r  resultsr   r^   )r  r   r   r  r   r`  r^  rW  r_   r  d  sT    


zWelfordReduction.createc                 C   s   dS )Nr  r^   r  r^   r^   r_   r    s    zWelfordReduction.default_valuer   r   r  r   r^  rW  r  r`  c	              
      s,  t tjjt d }	|	rp|dkrpfdd}
j||d t|
ddt|
ddf|d|dS t	d   t
|t fdd	|D |f g||}|D ]}|  qd
d |D }dd tjjt |}||}t
|tfdd	|D |gd|S )r  r   rY  c                    s   t | S ra   r  )r   r  r   r   r^   r_   r)    s    z4WelfordReduction.create_multilayer.<locals>.constantr   r)   r  c              	   3   s$   | ]}j | d dV  qdS )r   )r  N)r  )rl   r2  )r  r  r  r^  r  r^   r_   r     s   	z5WelfordReduction.create_multilayer.<locals>.<genexpr>c                 S   s   g | ]}|  qS r^   )r   rk   r^   r^   r_   rp      s     z6WelfordReduction.create_multilayer.<locals>.<listcomp>c                 S   s   || |S ra   r^   )rn   r  r2  r^   r^   r_   intermediate_loader_fn"  s    zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3   s   | ]}t  | d V  qdS )r  N)r   r   rk   )r  r^   r_   r   ,  s   )rA   rD   r   r   r   rS   r   r  r   r&   r  r  rJ   r   r   r  )r  r   r   r  r   r^  rW  r  r`  r  r)  Zintermediatesrm   Z	i_loadersr~  r^   )r  r  r   r  r  r^  r  r_   r    sh    

	
  z"WelfordReduction.create_multilayer)r   r   r   r   r   r  rc  r$  r2   ri  rM   r   r   r   r   r   r	   r   ri   r  r   r  r  r&  r^   r^   r  r_   r  9  s4   

	u
r  c                       s  e Zd ZU ee ed< ee ed< eeedf eedf geedf f ed< eee ee gee f ed< e	ed< e
ed< eejdf ed< eedef df ed	< eej d
 fddZ fddZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zee	jfejeejdf eeee gef df ee e
eeedf eedf geedf f e	eed  d d!d"Zeejejeee gef e
ee ee eeedf eedf geedf f ed#d$d%Z  Z S )&Scanscan_rangesr   .rX  rv   r`  r  dtypesr  r   c                    s:   t   t jdd | jD  B t jdd | jD  B S )Nc                 s   s   | ]}t |V  qd S ra   r   r   r^   r^   r_   r   L  s     z0Scan.get_unbacked_symbol_uses.<locals>.<genexpr>c                 s   s   | ]}t |V  qd S ra   r   r   r^   r^   r_   r   M  s     )r  r   r   r   r  r   r   r  r^   r_   r   F  s    zScan.get_unbacked_symbol_usesc                    s0   t | jt | j t | jks"tt   d S ra   )rr   r   r  r   rX   r  r   r   r  r^   r_   r   P  s    "zScan.__post_init__c                    sJ   |  ||  fdd| jD }t| j| j|}t|| || j S )Nc                    s   g | ]}| qS r^   r^   )rl   r   r  r^   r_   rp   V  s     z(Scan.store_reduction.<locals>.<listcomp>)rv   r  rC   scanr  rX  r-  r  )r   r/  r0  r1  Z	scan_varsrL   resultr^   r  r_   rc  T  s    zScan.store_reductionc                 C   s   dS )NZcustomr^   r   r^   r^   r_   r  Z  s    zScan.get_reduction_typec                 C   s   | j S ra   )r  r   r^   r^   r_   r  ^  s    zScan.get_reduction_sizec                 C   s   | j S ra   r   r   r^   r^   r_   r   a  s    zScan.get_sizec                 C   s   | j S ra   r   r   r^   r^   r_   r  d  s    zScan.get_pointwise_sizec                 C   s   t | jt | j S ra   )rr   r   r  r   r^   r^   r_   re  g  s    zScan.index_lengthc                 C   s.   |  | j}|  | jtj}| ||}|fS ra   )r  r   r  r(   rf  rv   r   rn   rh  r   r^   r^   r_   r  j  s    zScan.inner_fn_argsc                 C   s4   |  | j}|  | jtj}| ||}t| j|S ra   )r  r   r  r(   rf  rv   r-   r   r  r^   r^   r_   r   p  s    z#Scan.inner_fn_free_unbacked_symbolsrR   )r   r  r  r   axisrX  r`  r   c                    s  d    d d    g	t js>d gt S tjjd k	rdtdkrdd gt S tjj}	|		t
	}
ttkst|	t|
drfddttD S | jd d  	|
d\}|dkrtnt
|dkrtjjd k	rd gt S |dkrBtdkrBd gt S  	fdd	
fddttD }|D ]}|  q|S )	Nr)   c                    s&   g | ]}t j | | d qS )r  )r,  r  rl   r  )r   r  r  r   r^   r_   rp     s   zScan.create.<locals>.<listcomp>r   r   r   r   r  pointwise_rangesr  rX  
scan_numelc                    sB   t |t kstt | t ks(t| d   ||  d  S ra   )rr   rX   )rn   Z
scan_index)r  r
  r  r^   r_   rv     s    zScan.create.<locals>.reindexc                    sB   g | ]:}t 	f | | 
 |d qS ))r   r   r  r   r  r   r   r  rX  rv   r`  r  )rR   r  r  )rX  r   r  r  re   r
  r`  rv   r  	scan_typer   r^   r_   rp     s&   )r<   rY   rr   rM   versionZhiprD   r   r   r  rA   rX   r   rS   ZLers   r  r   	SplitScanr   )r  r   r  r  r   r  rX  r`  re   r   r  r  r  r  r^   )r  rX  r   r  r  re   r
  r`  rv   r  r  r   r_   r  v  sF    






zScan.creater	  c	           
   
      s(    fdd}	t j||||	||d|dS )Nc                    s   | d   ||  d  S ra   r^   r  r  r   r^   r_   r    s    z#Scan.num_splits.<locals>.wrapper_fnrF  )r   r  r_  r   r   r^  rW  r  )r]  r  )
r  r   r   r   r  r
  r  rX  r  r  r^   r  r_   r    s    zScan.num_splits)!r   r   r   r	   r   r   r   r   r   r2   r   rM   r   r   rS   r   r   r   rc  r  r  r   r  re  r  r   r$  ri  r   r
   r  r  r&  r^   r^   r  r_   r   8  sP   
, 
	&
Z&r   c                   @   s   e Zd ZdS )r  Nr   r   r   r^   r^   r^   r_   r    s   r  c                 C   s.   zt | dd W dS  tk
r(   Y dS X d S )NFfreezeT)as_storage_and_layoutr   r   r^   r^   r_   r     s
    r   c                 C   sF   z*t | dd\}}| r"|  | W S  tk
r@   Y dS X d S NFr  )r  should_pad_stridespad_stridesis_contiguousr   )r   bufferr  r^   r^   r_    is_contiguous_storage_and_layout  s    
r  Fc                 C   s   t | trt| j||||dS t | trt | jtr|r||rX| j  | jj s|t	n$|dk	rr| jj
||d n
| j  | | jjfS t | trt| j|d\}}|| jfS tdS )z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    r  want_contiguousstride_orderallow_paddingNr  r  )rH   rR   r  r  
StorageBoxBufferfreeze_layoutr  r  rX   freeze_layout_with_stride_orderr  ReinterpretViewr   )r   r  r  r  r  r  r   r^   r^   r_   r    s6    	

 



r  )r  c                 C   s8   zt | dd\}}||W S  tk
r2   Y dS X d S r  )r  is_stride_orderedr   )r   r  r  r  r^   r^   r_   "is_stride_order_storage_and_layout4  s
    r%  c                   @   s   e Zd ZU eed< dd Zdd Zdd Zdd	 Ze	d
d Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*S )+BaseViewr  c                 C   s
   | j  S ra   r  r   r   r^   r^   r_   r   @  s    z!BaseView.get_unbacked_symbol_usesc                 C   s   t d|  d S )Nzmake_reindexer NYI on r   r   r^   r^   r_   make_reindexerC  s    zBaseView.make_reindexerc                    s$   | j   |   fdd}|S )Nc                    s    | S ra   r^   r  innerrv   r^   r_   r0  J  s    z&BaseView.make_indexer.<locals>.indexer)r  r   r)  r   r0  r^   r*  r_   r   F  s    
zBaseView.make_indexerc                    s$   | j   |   fdd}|S )Nc                    s    | S ra   r^   r  r*  r^   r_   r2  S  s    z$BaseView.make_loader.<locals>.loader)r  r   r)  r   r2  r^   r*  r_   r   O  s    
zBaseView.make_loaderc                 C   s   | j jS ra   r  r   r   r^   r^   r_   r   X  s    zBaseView.dtypec                 C   s
   | j  S ra   r  r   r   r^   r^   r_   r   \  s    zBaseView.get_layoutc                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r   _  s    zBaseView.get_devicec                 C   s   d S ra   r^   r   r^   r^   r_   r  b  s    zBaseView.get_origin_nodec                 C   s
   | j  S ra   r  r   r   r^   r^   r_   r   e  s    zBaseView.get_namec                 C   s   |   S ra   r   r   r^   r^   r_   r  h  s    zBaseView.get_pointwise_sizec                 C   s   | j |S ra   )r  r   r   usersr^   r^   r_   r   k  s    zBaseView.mark_reusec                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r   n  s    zBaseView.has_exceeded_max_readsc                 C   s
   | j  S ra   r  r   r   r^   r^   r_   r   q  s    zBaseView.realizec                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r   t  s    zBaseView.realize_hintc                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r   w  s    zBaseView.get_storage_numelc                 C   s
   | j  S ra   )r  r  r   r^   r^   r_   r  z  s    zBaseView.is_externc                 C   s
   | j  S ra   )r  is_module_bufferr   r^   r^   r_   r5  }  s    zBaseView.is_module_bufferc              
   C   s:   t tdd" t|  |  jW  5 Q R  S Q R X d S r  )r   r  r   r/   r   r   r   r   r^   r^   r_   r     s
    zBaseView.get_readsc                 C   s   | }t |tr|j}q|S ra   )rH   r&  r  r   r   r^   r^   r_   unwrap_view  s    
zBaseView.unwrap_viewc                 C   s0   |   }ttd||}t||  ||  S r4  )r   r   r  r7  r,  r   r   r8  r^   r^   r_   r"    s    zBaseView.constant_to_deviceN)r   r   r   r   r   r   r)  r   r   propertyr   r   r   r  r   r  r   r   r   r   r   r  r5  r   r7  r"  r^   r^   r^   r_   r&  <  s,   
		
r&  c                   @   sB   e Zd ZU ee ed< edd Zedd Z	dd Z
dd	 Zd
S )rO   r   c                 C   s   t jj}tttj|}|  }dgt|t|  t| }t|t|ksRt	t
t|D ]n}|| dkr|| dk	s~t	|| ||< q^|| dks|| dkrq^|j|| ||  dddks^t	dq^|S )zReplace `-1` with correct sizesNr  r)   r   fallbackzKBroadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i})rD   r   r   rI   r   rS   r  r   rr   rX   rs   r   )r   new_sizer   old_sizerm   r^   r^   r_   _normalize_size  s     zExpandView._normalize_sizec           
      C   s   |  ||}t|rt|\}}t|t|j }|dks>ttdg| }t|j	|jD ]$\}}|
|dkrt|ntd q\t|j|jt|||j}	t||	S t||S Nr   r)   )r=  r   r  rr   r   rX   rS   r   rw   r   r  r  r   r   rI   offsetr#  rO   )
r  r   r;  storage
old_layoutskip
new_strider   r   
new_layoutr^   r^   r_   r    s"    
zExpandView.createc                 C   s   | j S ra   r  r   r^   r^   r_   r     s    zExpandView.get_sizec                    s4   |   }| j   t|t   fdd}|S )Nc                    sT   t | d  } t| t ks$ttt D ]} | dkr0td| |< q0| S Nr)   r   )rI   rr   rX   rs   rS   r   )rn   rm   actualrB  r^   r_   rv     s    z*ExpandView.make_reindexer.<locals>.reindex)r   r  rr   )r   targetrv   r^   rF  r_   r)    s
    
	zExpandView.make_reindexerN)r   r   r   r	   r   r   r   r=  r$  r  r   r)  r^   r^   r^   r_   rO     s   


rO   c                   @   sB   e Zd ZU ee ed< edd Zedd Zdd Z	dd	 Z
d
S )PermuteViewdimsc                    s   |  |}t|ttt|ks&tt|rvt|\} t j j	 fdd|D  fdd|D  j
}t||S t||S )Nc                    s   g | ]} j | qS r^   r  rk   rA  r^   r_   rp     s     z&PermuteView.create.<locals>.<listcomp>c                    s   g | ]} j | qS r^   r   rk   rK  r^   r_   rp     s     )_map_neg_dimsr   rs   rr   rX   r   r  r  r   r   r?  r#  rI  )r  r   rJ  r@  rD  r^   rK  r_   r    s    

zPermuteView.createc                    s    fdd D S )Nc                    s$   g | ]}|d kr|n
t  | qS r   rr   )rl   dimrJ  r^   r_   rp     s     z-PermuteView._map_neg_dims.<locals>.<listcomp>r^   )r  rJ  r^   rP  r_   rM    s    zPermuteView._map_neg_dimsc                    sD   t | | jt tt| jks&t| j   fdd| jD S )Nc                    s   g | ]} | qS r^   r^   rk   r  r^   r_   rp     s     z(PermuteView.get_size.<locals>.<listcomp>)r   rM  rJ  rs   rr   rX   r  r   r   r^   r  r_   r     s    &
zPermuteView.get_sizec                    s^   dd t | jD   fddtt| jD  t ttt| jksNt fdd}|S )Nc                 S   s   i | ]\}}||qS r^   r^   )rl   rm   jr^   r^   r_   r     s      z.PermuteView.make_reindexer.<locals>.<dictcomp>c                    s   g | ]} | qS r^   r^   rk   invr^   r_   rp     s     z.PermuteView.make_reindexer.<locals>.<listcomp>c                    s    fddD S )Nc                    s   g | ]} | qS r^   r^   rk   rt   r^   r_   rp      s     z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>r^   rt   rR  rt   r_   rv     s    z+PermuteView.make_reindexer.<locals>.reindex)r   rJ  rs   rr   r   rX   )r   rv   r^   rR  r_   r)    s
    zPermuteView.make_reindexerN)r   r   r   r	   r   r   r$  r  rM  r   r)  r^   r^   r^   r_   rI    s   


rI  c                   @   sB   e Zd ZeddddZeeejdf dddZ	d	d
 Z
dS )SqueezeViewNrO  c                   sD  t |rt|\}}g }g } d k	rPt ts6tdd krL t|jk sPttt|j|j	D ]`\}\}}	 d kr|dkr|
| |
|	 qb| kr|
| |
|	 qb|dksbtdqbt|j|j|||j}
t||
S  d krt|dd | D S |   dkstt| fddt| D S d S )Nzexpected integer dim argumentr   r)   zexpected squeezed size to be 1c                 S   s   g | ]}|d kr|qS r)   r^   r   r^   r^   r_   rp   '  s      z&SqueezeView.create.<locals>.<listcomp>c                    s   g | ]\}}| kr|qS r^   r^   rl   rm   r   rU  r^   r_   rp   *  s      )r   r  rH   r   rX   rr   r   r   rw   r   r  r  r   r   r?  r#  r  r  r   )r  r   rO  r@  rA  r;  rC  rm   r   r   rD  r^   rU  r_   r    s8    



zSqueezeView.create.r  c                    sV   dd | D }dd t | D t|  ttj ttjdf d fdd}||fS )Nc                 S   s   g | ]}|d kr|qS rV  r^   r   r^   r^   r_   rp   .  s      z(SqueezeView.squeezer.<locals>.<listcomp>c                 S   s   g | ]\}}|d kr|qS rV  r^   rW  r^   r^   r_   rp   /  s      .)rn   r   c                    sV   t | t ks"t|  d tdg  }t| D ]\}}|||< q<t|S )N r   )rr   rX   rS   r   rw   rJ   )rn   r  r   r   lengthZnot_oner^   r_   rv   2  s
    "
z%SqueezeView.squeezer.<locals>.reindex)r   rr   r	   rS   r   r   )r   r;  rv   r^   rY  r_   squeezer,  s
    &zSqueezeView.squeezerc                 C   s   t dd S )Nzuse SqueezeView.create())rX   )r   r  r^   r^   r_   r  ;  s    zSqueezeView.__init__)r   r   r   r$  r  r   r   rS   r   r[  r  r^   r^   r^   r_   rT    s
   %rT  c                   @   sZ   e Zd ZU ee ed< edef ed< dd Zdd Z	dd	 Z
e
Zed
d Zdd ZdS )GenericViewr   .rv   c                 C   s   | j S ra   )rv   r   r^   r^   r_   r)  D  s    zGenericView.make_reindexerc                 C   sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S   s   g | ]}t tj|qS r^   )r@   r(   r%  )rl   r  r^   r^   r_   rp   H  s    z+GenericView.reindex_str.<locals>.<listcomp>zlambda , : )rs   rr   r   rI   rv   r   r   ri   )r   Z	index_oldZ	index_newr^   r^   r_   reindex_strG  s
    zGenericView.reindex_strc                 C   s$   |  | jd| j d|   gS )Nsize=zreindex=)r   r  r   r_  r   r^   r^   r_   r  N  s    zGenericView.__str__c                 C   s   | |t ||S ra   )rI   )r  r   r;  rv   r^   r^   r_   r  U  s    zGenericView.createc                 C   s   | j S ra   r  r   r^   r^   r_   r   Y  s    zGenericView.get_sizeN)r   r   r   r	   r   r   r   r   r)  r_  r  r#  r$  r  r   r^   r^   r^   r_   r\  ?  s   

r\  c                   @   sH   e Zd Zedd Zedd Zedd Zedd Zed	d
 Z	dS )r  c                 C   s<   t | } t |}tjjjj}|t | dr8| | } | S r   )rS   r  rD   r   r   	shape_envevaluate_exprLt)r   r   rb  r^   r^   r_   handle_negative_index_  s    

zView.handle_negative_indexc           	         s   t |ttfst| | |\ }tjj	 |r:|S d}t
t dks^t
t|dkrbd}d|kr fdd}| |t||S t|s|r|rt|st|}t|\}}t|j|j|t||j}t||S |  |}| |t||S )NFr   Tc                    s   t dgt  S r   )rJ   rr   rt   r<  r^   r_   fake_reindexz  s    z!View.create.<locals>.fake_reindex)rH   rJ   rI   rX   resolve_negative_sizer   rD   r   r   Zstatically_known_list_equalsrr   r!   r  ExternKernelrealize_input as_contiguous_storage_and_layoutr  r   r   r   r   r?  r#  r  )	r  r   r;  Zunbacked_symbols_in_sizesrf  r@  rA  rD  rv   r^   re  r_   r  h  s6    

zView.createc                 C   s   dd |D }dd | D } t |}tt|D ]8}|| dkr0td||< tt| t|||<  qjq0tjj	
t| t| | |fS )Nc                 S   s   g | ]}t jj|qS r^   rD   r   r   r  r  r^   r^   r_   rp     s     z.View.resolve_negative_size.<locals>.<listcomp>c                 S   s   g | ]}t jj|qS r^   rk  r  r^   r^   r_   rp     s     r  r)   )rI   rs   rr   rS   r   r%   rA   rD   r   r   guard_equals)r<  r;  rm   r^   r^   r_   rg    s    zView.resolve_negative_sizec              	   C   sZ   z|  ||}W nD ttfk
rT   t|g}|  ||}|  ||}t||}Y nX |S ra   )_dynamic_reshape_indexerrX   
IndexErrorrA   r   )r  r<  r;  rv   Zflatr~   r   r^   r^   r_   r    s    
zView.dynamic_reshape_indexerc                    s>  t jjj}dd tt|D  tt |}t| }g |r|r| }| \}}|dkr	t
d |	||f q:|dkr|	| q:||||kr	| t jj|| q:||||k r$||||k r| \}}	|| | }||	 }qԈ	| t jj|| q:||||krt
d}
|}	t||
| |
| }
||||kr| }	t||
| |
| }
|| }q^t jj|| q:tq:|r| }t jj|d 	t
d q|r| \}}t jj|d q  tt| ks,t fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S   s   g | ]}t tj|qS r^   )r@   r(   ZVIEWrk   r^   r^   r_   rp     s    z1View._dynamic_reshape_indexer.<locals>.<listcomp>r)   r   c                    sH   t | t ks$tt | t ftt|  t fddD S )Nc                 3   s   | ]}t | V  qd S ra   rB   r  replacementsr^   r_   r     s     zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)rr   rX   rK   rw   rJ   rt   r1  Z	view_exprrp  r_   rv     s    $z.View._dynamic_reshape_indexer.<locals>.reindex)rD   r   r   r   rs   rr   rI   rw   r  r  rS   r   rl  r'   rX   reverse)r<  r;  r   Z	stack_newZ	stack_oldZsize_oldvarZsize_newZvar2Z	size_new2divisormodulusrv   r^   rr  r_   rm    s`    





zView._dynamic_reshape_indexerN)
r   r   r   r   rd  r$  r  rg  r  rm  r^   r^   r^   r_   r  ]  s   

)

r  c                       s   e Zd ZU dZded<  fddZdd ZeZdd	 Zd
d Z	dd Z
edd Zdd Zdd Zdd Zdd Zdd Zdd Zeej dddZd"d d!Z  ZS )#r#  z*Pretend our storage has a different layoutLayoutr  c                    s&   t    t| jtr"| j | _d S ra   )r  r   rH   r  r&  r7  r   r  r^   r_   r     s    
zReinterpretView.__post_init__c                 C   s   |  | j| jgS ra   )r   r  r  r   r^   r^   r_   r    s
    zReinterpretView.__str__c                 C   s
   | j  S ra   r0  r   r^   r^   r_   r   	  s    zReinterpretView.get_namec                 C   s   | j jS ra   r  r   r   r^   r^   r_   r   		  s    zReinterpretView.get_devicec                 C   s   d S ra   r^   r   r^   r^   r_   r  	  s    zReinterpretView.get_origin_nodec                 C   s   | j jS ra   )r  r   r   r^   r^   r_   r   	  s    zReinterpretView.dtypec                 C   s   t | jjS ra   rI   r  r   r   r^   r^   r_   r   	  s    zReinterpretView.get_sizec                 C   s   t | jjS ra   rI   r  r   r   r^   r^   r_   r   	  s    zReinterpretView.get_stridec                    s    fdd}|S )Nc                    s    j  }t  || S ra   )r  r   rC   loadr   rn   r0  r   r^   r_   r2  	  s    
z+ReinterpretView.make_loader.<locals>.loaderr^   r-  r^   r   r_   r   	  s    zReinterpretView.make_loaderc                 C   s
   | j  S ra   r  r   r   r^   r^   r_   r    	  s    zReinterpretView.make_indexerc                 C   s   | j S ra   r  r   r^   r^   r_   r   #	  s    zReinterpretView.get_layoutc                 C   s   d S ra   r^   r   r^   r^   r_   r!  &	  s    zReinterpretView.freeze_layoutr   c                 C   s$   t | jjt | jjB t | jjB S ra   )r!   r  r   r   r?  r   r^   r^   r_   r   )	  s    


z(ReinterpretView.get_unbacked_symbol_usesNc                 C   s$   t jj| j| jj| jj| jj|S ra   )	rD   r   wrapper_codeZcodegen_reinterpret_viewr  r  r   r   r?  r   r^   r^   r_   r   0	  s    z!ReinterpretView.codegen_reference)N)r   r   r   __doc__r   r   r  r#  r   r   r  r8  r   r   r   r   r   r   r!  r   rS   r   r   r   r&  r^   r^   r  r_   r#    s$   

r#  c                   @   s&   e Zd Zedd ZedddZdS )		SliceViewc                    sv   t jj| | tdd ||fD r6dd  nfdd  fdd}||dd}|||}||fS )	zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s   s   | ]}t |V  qd S ra   r   r  r^   r^   r_   r   G	  s     z0SliceView.normalize_start_end.<locals>.<genexpr>c                 S   s   t t | ||S ra   )rS   ZMinZMaxr   lowerupperr^   r^   r_   clampI	  s    z,SliceView.normalize_start_end.<locals>.clampc                    s      | ||S ra   )Zevaluate_minZevaluate_maxr  r   r^   r_   r  N	  s    c                    s$   | d kr|S  | }  | ||S ra   )rd  )r  r  r  r  )r  r  dim_sizer^   r_   
clamp_wrapQ	  s    z1SliceView.normalize_start_end.<locals>.clamp_wrapr   )rD   r   r   r   rB  )r  r   rO  startendr  r^   )r  r  r  r   r_   normalize_start_end>	  s    
zSliceView.normalize_start_endr)   Tc                    s  t dkstz"dkr6|dkr6dkr6|W S W n tk
rL   Y nX tjj}t| |rz| 	| |\}t
| d   < t|rt|\}}	t|	j}
|
   |
 < t|	j|	j|
|	j|	j    }t||S  fdd}t||dS )Nr   l    r)   c                    sD   t | t ks$td|  d t| } |     |  < | S )Nzwrong ndim rX  )rr   rX   rI   rt   rO  r;  r  stepr^   r_   rv   ~	  s    $z!SliceView.create.<locals>.reindex)r   rv   )rS   r  rX   	TypeErrorrD   r   r   rI   r   r  r&   r   r  r   r  r   r   r?  r#  r  )r  r   rO  r  r  r  r  r   r@  rA  rC  rD  rv   r^   r  r_   r  [	  s4    



zSliceView.createN)r)   T)r   r   r   r$  r  r  r^   r^   r^   r_   r  =	  s   
r  c                   @   sZ   e Zd ZU ejed< ejed< dd Zdd Zdd Z	d	d
 Z
dd Zdd Zdd ZdS )BaseConstantr   r   c                 C   s   dS Nr^   r^   r   r^   r^   r_   r   	  s    zBaseConstant.get_sizec                 C   s   | j S ra   r
  r   r^   r^   r_   r   	  s    zBaseConstant.get_devicec                 C   s   d S ra   r^   r   r^   r^   r_   r  	  s    zBaseConstant.get_origin_nodec                 C   s   d S ra   r^   r2  r^   r^   r_   r   	  s    zBaseConstant.mark_reusec                 C   s   dS r  r^   r   r^   r^   r_   r   	  s    z#BaseConstant.has_exceeded_max_readsc                 C   s   dS r  r^   r   r^   r^   r_   r   	  s    zBaseConstant.get_readsc                 C   s   dS r  r^   r   r^   r^   r_   r  	  s    zBaseConstant.is_externN)r   r   r   rM   r   r   r   r   r   r  r   r   r   r  r^   r^   r^   r_   r  	  s   


r  c                   @   sB   e Zd ZU eed< ejed< ejed< dd Zdd Z	dd	 Z
d
S )Constantr   r   r   c                    s    fdd}|S )Nc                    s   t  j jS ra   )rC   r)  r   r   rt   r   r^   r_   r2  	  s    z$Constant.make_loader.<locals>.loaderr^   r-  r^   r   r_   r   	  s    zConstant.make_loaderc                 C   s   d S ra   r^   r   r^   r^   r_   r   	  s    zConstant.realizec                 C   s   t | j| j|S ra   )r  r   r   r!  r^   r^   r_   r"  	  s    zConstant.constant_to_deviceN)r   r   r   r   r   rM   r   r   r   r   r"  r^   r^   r^   r_   r  	  s   


r  c                   @   s:   e Zd ZU eed< ejed< ejed< dd Zdd Z	dS )	IndexingConstantrn   r   r   c                    s    fdd}|S )Nc                    s   t  j jS ra   )rC   r  rn   r   rt   r   r^   r_   r2  	  s    z,IndexingConstant.make_loader.<locals>.loaderr^   r-  r^   r   r_   r   	  s    zIndexingConstant.make_loaderc                 C   s   t | j| j|S ra   )r  rn   r   r!  r^   r^   r_   r"  	  s    z#IndexingConstant.constant_to_deviceN)
r   r   r   r   r   rM   r   r   r   r"  r^   r^   r^   r_   r  	  s
   


r  c                 C   s    t dd t| t||D S )Nc                 s   s$   | ]\}}}|d kp||kV  qdS r  r^   )rl   leftrightr   r^   r^   r_   r   	  s   z2is_contiguous_strides_for_shape.<locals>.<genexpr>)r  rw   r   r   )r   shaper^   r^   r_   is_contiguous_strides_for_shape	  s      r  c                 C   s
   d| j  S )z
    CUDA max memory transaction size is 128 bytes for a warp.
    We pick `128 // dtype.itemsize` as alighment so GPU can do coalesced
    memory access.
    r  )itemsizer   r^   r^   r_   get_align_for_dtype	  s    r  c                	   @   s   e Zd Zedfejejee e	e
eeef   edddZedd Zdd ZeZd	d
 Zedd Zdd Zdd Zdd Zedd Zdd Zdd Zdd Zdd ZedddZejdd d!Z d"S )#rw  r   r   r   r   r   r?  c                 C   sd   |d ks,t |t |ks,td| d| || _|| _tdd |D sNt|| _|| _|| _d S )Nr`  	, stride=c                 s   s   | ]}t |ttfV  qd S ra   )rH   r   r   r   r^   r^   r_   r   	  s     z"Layout.__init__.<locals>.<genexpr>)rr   rX   r   r   r  r   _strider?  r   r   r   r   r   r?  r^   r^   r_   r  	  s    zLayout.__init__c                 C   s   | j S ra   )r  r   r^   r^   r_   r   	  s    zLayout.stridec                 C   sP   d}| j dkrd| j  }t| j d| jj d| j d| j d| j | dS )	Nr   r   z	, offset=z('z', z, size=r  ))r?  rY   r   r   r   r   r   )r   r?  r^   r^   r_   r  	  s
    
4zLayout.__str__c                 C   s   t | j| jS ra   )r  r   r   r   r^   r^   r_   r  	  s    zLayout.is_contiguousc                 C   sV   t | }|dks| d dkr dS t|t| | D ] \}}}|dkr0||kr0 dS q0dS )N)r      r)   FT)rr   rw   r   )r  r  ndimr  r  r   r^   r^   r_   is_channels_last_contiguous	  s      z"Layout.is_channels_last_contiguousc                 C   sB   t | jtt| j| jD ] \}}}|dkr||kr dS qdS )Nr)   FT)rw   r   reversedr   r   r   )r   r  r  r   r^   r^   r_   is_transposed	
  s    zLayout.is_transposedc                    s   t jt  kstdd tjD }fdd|D } fdd|D  dd }|  dgt   }tt  D ]}tjj	|| | | < qxtt  d D ]}|| ||d  kr d	S qd
S )Nc                 S   s*   g | ]"\}}t jjj|d ddkr|qS )r   r9  r)   rD   r   r   r   )rl   rm   rO  r^   r^   r_   rp   
  s   z,Layout.is_stride_ordered.<locals>.<listcomp>c                    s   g | ]} j | qS r^   rL  rk   r   r^   r_   rp   
  s     c                    s   g | ]} | qS r^   r^   rk   r{   r^   r_   rp   
  s     c                    s   t |   fdd| D S )Nc                    s   g | ]}  |qS r^   rt   )rl   elementZ
sorted_arrr^   r_   rp   "
  s     zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)r  )Zarrr^   r  r_   sorted_indices 
  s    z0Layout.is_stride_ordered.<locals>.sorted_indicesr  r)   FT)
rr   r   rX   r   r   rs   rD   r   r   r   )r   ry   Znon_1_indicesr   r  stride_orderedrm   r^   )ry   r   r_   r$  
  s    zLayout.is_stride_orderedc                 C   s:   dgt ttdt| jd  }t|g| }| |S r>  )rI   r  rs   rr   r   r$  r   ry   r^   r^   r_   is_channels_last_stride_ordered1
  s    "z&Layout.is_channels_last_stride_orderedc                 C   s2  t |}t| dkr| S tjs.t|| r.| S t }t|drR|j	
ddrR| S tdd t| |D sp| S t| }t|}dd tt| D }d	||d < d
}d}	t|d	d d	dD ]V\}
}||
d	  }|| ||  }||kr|| dkrt||| }d}	|||< q|	s | S t jd	7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   metaZdislike_paddingFc                 s   s   | ]}t |ttjfV  qd S ra   rk  r   r^   r^   r_   r   Q
  s   z&Layout._pad_strides.<locals>.<genexpr>c                 S   s   g | ]}d qS r   r^   r   r^   r^   r_   rp   Z
  s     z'Layout._pad_strides.<locals>.<listcomp>r)   r  N)r  T)r  rr   r*   Zpad_channels_lastrw  r  rD   Zget_current_nodehasattrr  getr  r  chainr   r   rs   r   r6   r   Znum_comprehensive_padding)Z
in_stridesr   r   alignZcurrent_fx_noder  r   Znew_stridesZalign_stride_thresholdpaddedZrankr   Zprev_idxr   r^   r^   r_   _pad_strides7
  sF    
  

zLayout._pad_stridesc                 C   s6   t | tst| jd k	st| | j| j| j| _d S ra   )rH   r   rX   r  r  r   r   r   r^   r^   r_   r  
  s    zLayout.pad_stridesc                 C   s   t jot| tS ra   )r*   Zcomprehensive_paddingrH   r   r   r^   r^   r_   r  
  s    zLayout.should_pad_stridesc                 C   s8   t | tr| S |  r|   t| j| j| j| j| jS ra   )	rH   r  r  r  r   r   r   r   r?  r   r^   r^   r_   as_fixed
  s    
zLayout.as_fixedc                 C   s(   t jstdt| j d|   S )Nzconvert z to FixedLayout first)r   r  rX   rY   r   r  r   r   r^   r^   r_   r   
  s
    zLayout.make_indexerr   c                 C   s<   | j |j ko:| j|jko:| j|jko:| j|jko:| j|jkS ra   r  )r   otherr^   r^   r_   __eq__
  s    



zLayout.__eq__c                 C   s   t | j| j| jS ra   )r   r   r   r?  r   r^   r^   r_   storage_size
  s    zLayout.storage_sizeN)!r   r   r   r   rM   r   r   r	   r   r
   r   r   r   r  r8  r   r  r#  r  r   r  r  r$  r  r  r  r  r  r   r   r  rS   r  r^   r^   r^   r_   rw  	  s4   
	


K	rw  c                	       sl   e Zd ZdZdedfejejee	e
 e	e f eeee
ef   ee
ef d fddZdd Z  ZS )	r  z A Tensor layout we cannot changeNr   r  c                    s*   |d krt |}t ||||| d S ra   )r   r   r  r  r  r  r^   r_   r  
  s    
zFixedLayout.__init__c                    s    fdd}|S )z1A closure containing math to read a given elementc                    sf   t | t  jkstt | t  jks,t j}t|  j jD ]\}}}|dkrB|||  }qB|S r  )rr   r   rX   r   r?  rw   )rn   r  r   r   szr   r^   r_   r0  
  s    z)FixedLayout.make_indexer.<locals>.indexerr^   r,  r^   r   r_   r   
  s    	zFixedLayout.make_indexer)r   r   r   r  r   rM   r   r   r   r	   r   r   r
   r   r  r   r&  r^   r^   r  r_   r  
  s   
r  c                       s|   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	edd Z
dddZdd Zdd Zd fdd	Z  ZS )r   z(A Tensor layout we are allowed to changeFc                 C   sP   t | dkrg S tdg}t| dd  D ]}|||d   q,tt|S )Nr   r)   r  )rr   rS   r   r  r  rI   )sizesZreversed_stridesr   r^   r^   r_   r   
  s    z!FlexibleLayout.contiguous_stridesc                 C   sV   t tt| t |ksttd}dgt| }|D ]}|||< || |  }q8|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        r)   N)r   rs   rr   rX   rS   r   )r  ry   Znext_strider  rm   r^   r^   r_   fill_ordered
  s    
zFlexibleLayout.fill_orderedc                 C   s0   t tt| t |kstt|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r   rs   rr   rX   r   r   r  )r  ry   r   r^   r^   r_   r  
  s    zFlexibleLayout.stride_orderedc                 C   sT   |t jkrt| tS |t jkr,t| tS |t jkr@t| S t	
d| tdS )aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %sN)rM   channels_lastr   r  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERZcontiguous_formatr   r  r  r   )r  memory_formatr^   r^   r_    stride_ordered_for_memory_format
  s    



z/FlexibleLayout.stride_ordered_for_memory_formatc                 C   sD   t | t |kstdd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S   s   g | ]}t jj|qS r^   r  r  r^   r^   r_   rp     s     z/FlexibleLayout.same_ordered.<locals>.<listcomp>ru  )rr   rX   r  rs   __getitem__r   r  )r  r   r   r^   r^   r_   same_ordered  s    zFlexibleLayout.same_orderedc                 C   sD   |  | j|}|  r,|r,| || j| j}t| j| j| j|| jS ra   )r  r   r  r  r   r  r   r?  )r   ry   r  rC  r^   r^   r_   as_stride_order"  s    zFlexibleLayout.as_stride_orderc                 C   s@   |  | j|}|  r(| || j| j}t| j| j| j|| jS ra   )r  r   r  r  r   r  r   r?  )r   ry   rC  r^   r^   r_   as_fill_order/  s    zFlexibleLayout.as_fill_orderc                 C   s@   |  | j|}|  r(| || j| j}t| j| j| j|| jS ra   )r  r   r  r  r   r  r   r?  )r   r   rC  r^   r^   r_   as_same_order;  s    zFlexibleLayout.as_same_orderNc                    s2   |rt ||}n
t |}t |||| d S ra   )r   r  r   r  r  )r   r   r   r   r  r  r  r^   r_   r  G  s    
zFlexibleLayout.__init__)F)N)r   r   r   r  r  r   r   r  r  r  r  r  r  r  r  r&  r^   r^   r  r_   r   
  s    





r   c                       s>   e Zd ZdZeedf d fddZdd Zdd	 Z  Z	S )
NonOwningLayoutz,Is a view into the storage of another tensorrR   )viewc                    s,   |  }t |j|j|j|j || _d S ra   )r   r  r  r   r   r   r   r  )r   r  r  r  r^   r_   r  R  s    zNonOwningLayout.__init__c                 C   s   |    S ra   )r  r   r   r^   r^   r_   r   \  s    zNonOwningLayout.make_indexerc                 C   s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr)   )	ALIGNMENT)	r  r   r?  Z
compile_fxr  rD   r   r   Zstatically_known_multiple_of)r   r?  r  r^   r^   r_   maybe_guard_aligned_  s
    z#NonOwningLayout.maybe_guard_aligned)
r   r   r   r  r   r&  r  r   r  r&  r^   r^   r  r_   r  O  s   
r  c                   @   s$   e Zd Zdd Zdd Zdd ZdS )
NoneLayoutc                 C   s   || _ dg| _dg| _d S r   )r   r   r   r!  r^   r^   r_   r  q  s    zNoneLayout.__init__c                 C   s   dS r   r^   r   r^   r^   r_   r  v  s    zNoneLayout.storage_sizec                 C   s   | S ra   r^   r   r^   r^   r_   r  y  s    zNoneLayout.as_fixedN)r   r   r   r  r  r  r^   r^   r^   r_   r  h  s   	r  c                       sv   e Zd Zed fddZejjdd Zej	dddZ
d	dd
dZdd ZedddZdd Zdd Z  ZS )MutationLayoutSHOULDREMOVErH  c                    s@   t  | | | d  || _|   }tj	
| d S ra   )r  r  r   r   r   rH  
get_bufferr   rD   r   mark_buffer_mutated)r   rH  rg   r  r^   r_   r  ~  s    z#MutationLayoutSHOULDREMOVE.__init__c                 C   s
   |   jS ra   )real_layoutr   r   r^   r^   r_   r     s    z!MutationLayoutSHOULDREMOVE.strider   c                 C   s   |    S ra   )r  r  r   r^   r^   r_   r    s    z'MutationLayoutSHOULDREMOVE.storage_sizer   c                    s,    fdd  | j }t|ts(td|S )Nc                    sB   t | tr | jS t | tr* |  S t | tr> | jS | S ra   )rH   r  rH  r&  r7  
MutableBoxr  r  unwrap_viewsr^   r_   r    s    




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_viewsz1MutationLayoutSHOULDREMOVE must refer to a buffer)rH  rH   r   rX   )r   r  r^   r  r_   r    s    	
 z%MutationLayoutSHOULDREMOVE.get_bufferc                 C   s
   |   jS ra   )r  r  r   r^   r^   r_   r    s    z&MutationLayoutSHOULDREMOVE.real_layoutFc              	   C   s   |   tj|  t|tr(|j}|  |slt	j
| | | dd t| | D dj}|   t|jjtstt||j_|jS )Nc                 S   s    g | ]\}}t jj||qS r^   rD   r   r   rl  rl   rQ  rR  r^   r^   r_   rp     s   z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>r  )r   rD   r   r  r   rH   rR   r  r   r,  r  r   r   r   rw   r   r  r   rX   r  )r  srcdstZunsafe_aliasr^   r^   r_   realize_into  s$    

z'MutationLayoutSHOULDREMOVE.realize_intoc                 C   s   | S ra   r^   r   r^   r^   r_   r    s    z#MutationLayoutSHOULDREMOVE.as_fixedc                 C   s
   | j  S ra   )rH  r   r   r^   r^   r_   r     s    z'MutationLayoutSHOULDREMOVE.make_indexer)F)r   r   r   r   r  rw  r   getterrS   r   r  r  r  r$  r  r  r   r&  r^   r^   r  r_   r  }  s   
"r  c                       s@  e Zd ZU ee ed< eed<  fddZdd Zeddd	Z	d
d Z
dd Zedd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZdBdd Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* ZdCd,d-Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Z e!e"j# dd8d9Z$e!e"j# dd:d;Z%d<d= Z&d>d? Z'd@dA Z(  Z)S )Dr   rg   r  c                    s   t    d | _d S ra   r  r   r  r^   r_   r     s    
zBuffer.__post_init__c                 C   s
   | j  S ra   r}  r   r^   r^   r_   r     s    zBuffer.make_indexerr   c                 C   s   | j st| | j S ra   )rg   rX   r   r^   r^   r_   r     s    zBuffer.get_namec                 C   s   | j jS ra   rx  r   r^   r^   r_   r     s    zBuffer.get_devicec                 C   s   | j S ra   r  r   r^   r^   r_   r    s    zBuffer.get_origin_nodec                 C   s   t | jdd S )Nr   )rb   r  r   r^   r^   r_   r     s    zBuffer.dtypec                 C   s   t | jjS ra   ry  r   r^   r^   r_   r     s    zBuffer.get_sizec                 C   s   t | jjS ra   rz  r   r^   r^   r_   r     s    zBuffer.get_stridec                 C   s   | j jS ra   )r  r?  r   r^   r^   r_   
get_offset  s    zBuffer.get_offsetc                 C   s   | j S ra   r~  r   r^   r^   r_   r     s    zBuffer.get_layoutc                 C   s   |   S ra   )r   r   r^   r^   r_   r     s    zBuffer.get_storage_numelc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zBuffer.is_externc                 C   s    t | jttfs| j | _d S ra   )rH   r  MultiOutputLayoutr  r  r   r^   r^   r_   r!    s    zBuffer.freeze_layoutFc                 C   s&   t | jtst| jj||d| _d S )Nr  )rH   r  r   rX   r  )r   ry   r  r^   r^   r_   r"    s    z&Buffer.freeze_layout_with_stride_orderc                 C   s"   t | jtst| j|| _d S ra   )rH   r  r   rX   r  r  r^   r^   r_   freeze_layout_with_fill_order  s    z$Buffer.freeze_layout_with_fill_orderc                 C   s"   t | jtst| j|| _d S ra   )rH   r  r   rX   r  )r   r   r^   r^   r_   freeze_layout_with_same_order
  s    z$Buffer.freeze_layout_with_same_orderc                 C   s   t jjt|  dS r   r   r   r^   r^   r_   r     s    zBuffer.is_zero_elementsc                    s(      rtt  dS  fdd}|S )Nr   c                    s    j  }t j|| S ra   )r  r   rC   r{  rg   r|  r   r^   r_   r2    s    
z"Buffer.make_loader.<locals>.loader)r   r   r+  r   r-  r^   r   r_   r     s    zBuffer.make_loaderc                 C   s   dS r  r^   r   r^   r^   r_   is_no_op  s    zBuffer.is_no_opNc                 C   s   |   S ra   r   r   r^   r^   r_   r     s    zBuffer.codegen_referencec                 C   s   d S ra   r^   r   r^   r^   r_   r  "  s    zBuffer.decide_layoutc                 C   s   t | jtr| jj gS dS r  )rH   r  r  r  r   r   r^   r^   r_   get_inputs_that_alias_output%  s    z#Buffer.get_inputs_that_alias_outputc                 C   s   t | jtr| jj gS dS r  )rH   r  r  rH  r   r   r^   r^   r_   get_mutation_names*  s    zBuffer.get_mutation_namesc              
   C   s8   t tdd  t|  |  W  5 Q R  S Q R X d S r  )r   r  r   r/   r   r   r   r^   r^   r_   r  /  s
    zBuffer.get_read_writesc                 C   s
   |   jS ra   )r  r   r   r^   r^   r_   r   6  s    zBuffer.get_readsc                 C   s   t  S ra   r   r   r^   r^   r_   get_unbacked_symbol_defs9  s    zBuffer.get_unbacked_symbol_defsc                 C   s   t  S )a  
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.
        r  r   r^   r^   r_   r   <  s    zBuffer.get_unbacked_symbol_usesc                 C   s   d S ra   r^   r   r^   r^   r_   r   M  s    zBuffer.realizec                 C   s   dS )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   r^   r   r^   r^   r_   get_workspace_sizeP  s    zBuffer.get_workspace_sizec                 C   s   dS r  r^   r   r^   r^   r_   should_allocateW  s    zBuffer.should_allocate)F)N)*r   r   r   r
   ri   r   rw  r   r   r   r   r  r8  r   r   r   r  r   r   r  r!  r"  r  r  r   r   r  r   r  r  r  r  r   r   rS   r   r  r   r   r  r  r&  r^   r^   r  r_   r     sB   



r   c                   @   s   e Zd ZdS )InputBufferNr  r^   r^   r^   r_   r  \  s   r  c                   @   s0   e Zd ZU dZeej ed< dd Zdd Z	dS )r7  Nr6  c                    s    fdd}|S )Nc                    s*    j  }ttj   j|| S ra   )	r  r   rC   r{  rD   r   constant_namer   r6  r|  r   r^   r_   r2  d  s
    
z*ConstantBuffer.make_loader.<locals>.loaderr^   r-  r^   r   r_   r   c  s    zConstantBuffer.make_loaderc                 C   s   t tj|  || jS ra   )r7  rD   r   r  r   r  r!  r^   r^   r_   r"  m  s     z!ConstantBuffer.constant_to_device)
r   r   r   r6  r
   rM   r   r   r   r"  r^   r^   r^   r_   r7  `  s   

r7  c                   @   s*   e Zd Zeej dddZdddZdS )NoneAsConstantBufferr   c                 C   s   t  S ra   r  r   r^   r^   r_   r   t  s    z-NoneAsConstantBuffer.get_unbacked_symbol_usesNc                 C   s
   t jjjS ra   )rD   r   r  none_strr   r^   r^   r_   r   w  s    z&NoneAsConstantBuffer.codegen_reference)N)r   r   r   r   rS   r   r   r   r^   r^   r^   r_   r  s  s   r  c                       s:   e Zd Z fddZeej dddZd	ddZ  Z	S )
ShapeAsConstantBufferc                    s   t    || _d S ra   )r  r  r  )r   r  r  r^   r_   r  |  s    
zShapeAsConstantBuffer.__init__r   c                 C   s
   t | jS ra   )r!   r  r   r^   r^   r_   r     s    z.ShapeAsConstantBuffer.get_unbacked_symbol_usesNc                 C   s   t jjt jj| jS ra   )rD   r   r  Zexpr_printerr   r  r  r   r^   r^   r_   r     s    z'ShapeAsConstantBuffer.codegen_reference)N)
r   r   r   r  r   rS   r   r   r   r&  r^   r^   r  r_   r  {  s   r  c                       s   e Zd ZU eed< dd Zedd Zdd Ze	e
j dd	d
Z fddZdd Zdd Zdd Zedd Zd%eeeeef ee f  dddZed&ddZdd Zdd Zdd  Zd!d" Zd#d$ Z  ZS )'r  r  c                 C   s(   | j dk	r| j S t| jdr$| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nrg   )rg   r  r  r   r^   r^   r_   get_computed_buffer_name  s
    
z'ComputedBuffer.get_computed_buffer_namec                 C   s   t |  jS ra   )rr   r  r   r   r^   r^   r_   	num_reads  s    zComputedBuffer.num_readsc              
   C   sp   t tddX | j rBt|  | j | j W  5 Q R  S t|  | j	 W  5 Q R  S W 5 Q R X d S r  )
r   r  r   r  r  r/   get_store_functionr  r  r   r   r^   r^   r_   r    s    
zComputedBuffer.get_read_writesr   c                 C   s.   t |  t |  B t |  B | j B S ra   )r!   r   r   r  r  r   r   r^   r^   r_   r     s    


z'ComputedBuffer.get_unbacked_symbol_usesc                    s:   t | jdr0| jtjjkr0|  dkr0| j S t  S )Nr   r   )	r  r  rg   rD   r   Zmutated_buffersr  r   r  r   r  r^   r_   r     s    


zComputedBuffer.make_loaderc                 C   sV   | j   }t| jttfr0t| jj| j	|S t| jt
s@tt| jj| j	|S d S ra   )r  r  r   rH   r  r]  r   r   rc  rg   r,  rX   r3  r,  r^   r^   r_   r    s
    z!ComputedBuffer.get_store_functionc                    s   t | jtrt| j | j \\}}|  j	}dd |D }t
dd |D sZtfdd|D }|rt | jtr| j| n|  fdd|D }ddlm} |||  S d	S )
al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 S   s0   g | ](}|j tjj kr(tjj|j  nd qS ra   )rg   rD   r   r  r  r  r^   r^   r_   rp     s   z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                 s   s    | ]}t |tjtjfV  qd S ra   )rH   r+   StarDep	MemoryDepr  r^   r^   r_   r     s   z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                    s.   g | ]&}t |tjrt|jd d  D qS )c                 S   s    i | ]}|d kr|t d qS r   r  rl   vr^   r^   r_   r     s       z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)rH   r+   r  rB   rn   r  )rd  r^   r_   rp     s    c                    s   g | ]}t jj| qS r^   rD   r   r   r  rl   exprr  r^   r_   rp     s    r)   pick_loop_orderN)rH   r  r   r+   r  r  r  r  r  r   r  rX   r   rv   	schedulerr  r   )r   
index_varsr   r   
reads_bufsZstride_lengthsr  r^   )r  rd  r_   get_fill_order  s2     


zComputedBuffer.get_fill_orderc                 C   s0   t | jtr,|  }|r$| | n|   d S ra   )rH   r  r   r  r  r!  r  r^   r^   r_   r    s
    zComputedBuffer.decide_layoutc           
   	   C   s   t j| j | j dd\}}ttd|  * t	| 
 |  rH|n
|d d |}W 5 Q R X g }g }g }g }| D ]V\}}	||d kr|rt|| ||	 qz||d kst|| ||	 qz||f|||ffS )NqrF   r6  r)   r   )r+   r  r  r  r  r   r  r7  r   LoopBodyr  r  itemsrX   r  )
r   rd   
var_rangesr  r  reduce_vars
index_sizereduce_sizer  r   r^   r^   r_   get_default_sizes_body  s0      


z%ComputedBuffer.get_default_sizes_bodyNextra_indexing_constraintsc                    sR    \\}}}\}}|j  |dk	rt|tr@t|dksDt|\}}t|tsZtt|tshtt	dd |D s~t|j
}	|	|kst|	|f fdd|D } |7  dd |j D }
|j |j  fdd	}|| }||||\}}}||||\}}}tj||d
d\\}}}t|||||g|}||f|fS )a  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Nr   c                 s   s   | ]}t |tV  qd S ra   )rH   r   )rl   fr^   r^   r_   r   G  s     z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                    s   g | ]}| kr|qS r^   r^   r   )index_formulasr^   r_   rp   O  s     z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>c                 S   s,   g | ]$}|t jj kr$t jj| nd qS ra   )rD   r   r  r  )rl   Z
reads_namer^   r^   r_   rp   T  s   c                    sZ    | ||\}}}|| } tjj| |t | |\}}}|| } t||}|||fS ra   )_apply_loop_reorderingrD   r   r   _simplify_loopsr,   r   )Zx_varssupport_varsr  Zreindex0r~   r   prunerv   r  memory_addrsr   r^   r_   simplify_and_reorder_  s       



zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderzrF   )r  indexing_exprsrL   rH   rJ   rr   rX   rK   rI   r  r  reads_name2exprr  writes_name2exprr+   Zindex_vars_no_squeezer  )r   r  r  r  r  r  r  Zextra_indexing_rangesZextra_indexing_exprZexpected_var_rangesr  r
  r  Ziter_rangesZiter_reindexr   Zreduce_rangesZreduce_reindexZ	iter_varsr  r^   r  r_   r
  %  sj    



  
    z#ComputedBuffer.simplify_and_reorderc              
      s   ddl m} |dkrg }zT fdd|D }t|t|krRt|d t ksVttt|||}W nB tk
r   tjrt	
dtt | ttt}Y nX fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r)   r  Nc                    s   g | ]}t jj| qS r^   r  r  )r  r  r^   r_   rp     s   z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                    s   g | ]} | qS r^   r^   rk   )r  r^   r_   rp     s     )r  r  rr   rX   rI   r  	Exceptionr*   r  r  warningrK   rw   rs   r|   rz   )r  r  r  r	  Zpriority_idxr  r  ry   r^   )r  r  r  r_   r    s*    
z%ComputedBuffer._apply_loop_reorderingc                 C   s
   | j  S ra   )r  r  r   r^   r^   r_   r    s    z!ComputedBuffer.get_reduction_sizec                 C   s
   | j  S ra   )r  r  r   r^   r^   r_   r    s    z!ComputedBuffer.get_reduction_typec                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r    s    zComputedBuffer.is_no_opc                 C   s   dS NTr^   r   r^   r^   r_   r    s    zComputedBuffer.should_allocatec                 C   s   | j |S )r5  )r  r"  r!  r^   r^   r_   r"    s    z!ComputedBuffer.constant_to_device)N)N) r   r   r   r   r   r  r5   r  r  r   rS   r   r   r   r  r  r  r  r
   r   r   r   r	   r
  r   r  r  r  r  r  r"  r&  r^   r^   r  r_   r    s0   

/
 _ #r  c                       sx   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
deeeeef ee f  dddZ  ZS )TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    c                    s4   t  jd |d t|| _|| _tj| | _	d S )N)rg   r  )
r  r  InputsKernelunwrap_storageinputsmake_kernel_renderrD   r   register_bufferrg   )r   r  r  r  r  r^   r_   r    s    zTemplateBuffer.__init__c                 C   s   |   S ra   )normalized_read_writesr   r^   r^   r_   r    s    zTemplateBuffer.get_read_writesc                    sL   |   | j   fdd}tj||  ddd}dd | jD |_|S )Nc                    s"   t |dkstt | dS )Nr   Zfake)rr   rX   rC   r-  r  r0  rg   r^   r_   dummy  s    z4TemplateBuffer.normalized_read_writes.<locals>.dummyr^   T)	normalizec                 S   s   h | ]}t | qS r^   r+   r  r   r  r^   r^   r_   r     s     z8TemplateBuffer.normalized_read_writes.<locals>.<setcomp>)r   r  r   r+   r/   r   r  r   )r   r  depsr^   r  r_   r    s    
   z%TemplateBuffer.normalized_read_writesc                 C   s   dS r  r^   r   r^   r^   r_   r    s    z!TemplateBuffer.get_reduction_sizec                 C   s   d S ra   r^   r   r^   r^   r_   r    s    z!TemplateBuffer.get_reduction_typec                 C   s   dS r  r^   r   r^   r^   r_   r    s    zTemplateBuffer.is_no_opc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zTemplateBuffer.should_allocateNr   c                 C   s   |   dfd fS r  r1  )r   r  r^   r^   r_   r
    s
    z#TemplateBuffer.simplify_and_reorder)N)r   r   r   r  r  r  r  r  r  r  r  r
   r   r   r   r	   r
  r&  r^   r^   r  r_   r    s    r  c                       s4   e Zd Zdeee  d fddZdd Z  ZS )TritonTemplateBufferN)mutated_inputsc                    sp   t  ||| || _|| _|dk	rltjjjtjjjh}t	j
jj}||ks^td| d| t| f|  dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currenlty our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        Nz$Mutated inputs are only allowed for z	 but got )r  r  debug_extrar  rM   rC   Zhigher_orderZflex_attentionZflex_attention_backwardrD   r   current_noderH  rX   mark_node_as_mutating)r   r  r  r  r   r  Zallowed_setr!  r  r^   r_   r    s    
zTritonTemplateBuffer.__init__c                 C   s   d| j  d| j d}|S )NzTritonTemplateBuffer(layout=r]  r  )r  r   )r   r   r^   r^   r_   r    s    zTritonTemplateBuffer.__str__)NN)	r   r   r   r
   r   r   r  r  r&  r^   r^   r  r_   r    s     
 r  c                       s   e Zd ZdZ fddZedddZedddZd	d
 Z	edddZ
ddddZeeeeee f f dddZ  ZS )ChoiceCallera.  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUDATemplateCaller.
    c                    s    t    || _|| _|| _d S ra   )r  r  rg   r  input_nodes)r   rg   r$  r  r  r^   r_   r  !  s    
zChoiceCaller.__init__r   c                G   s   |   }t||d|iS )Nr   )to_callabler3   )r   r   rd   algor^   r^   r_   	benchmark'  s    zChoiceCaller.benchmarkc                 C   s   t d S ra   r(  r   r^   r^   r_   	call_name+  s    zChoiceCaller.call_namec                 C   s   t d S ra   r(  r   r^   r^   r_   r%  .  s    zChoiceCaller.to_callablec                 C   s   t d S ra   r(  r   r^   r^   r_   hash_key1  s    zChoiceCaller.hash_keyrR   c                 C   s   t d S ra   r(  r   r^   r^   r_   output_node4  s    zChoiceCaller.output_nodec                 C   s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.r^   r   r^   r^   r_   	info_dict7  s    zChoiceCaller.info_dict)r   r   r   r  r  r*  r'  ri   r(  r%  r)  r*  r   r   PrimitiveInfoTyper	   r+  r&  r^   r^   r  r_   r#    s   r#  c                   @   s   e Zd ZedddZdS )TritonTemplateCallerBaser   c                 C   s   t d S ra   r(  r   r^   r^   r_   get_make_kernel_render=  s    z/TritonTemplateCallerBase.get_make_kernel_renderN)r   r   r   r   r.  r^   r^   r^   r_   r-  <  s   r-  c                       s   e Zd ZdZeee eg ee	e
f f d fddZeee	e
f dddZejedd	d
ZedddZee	e
f dddZ  ZS )MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    )r  r  choice_timingsc                    s(   t  j||d d || _d | _|| _d S )N)r  r  r  )r  r  _choice_timings_fn_choice_timingsZoriginal_inputs)r   r  r  r0  r  r^   r_   r  J  s    zMultiTemplateBuffer.__init__r   c                 C   s   | j d kr|  | _ | j S ra   )r2  r1  r   r^   r^   r_   r0  U  s    

z"MultiTemplateBuffer.choice_timings)callerc                 c   sL   t |tjjjst| j|jks$t| j}| | _z
d V  W 5 || _X d S ra   )	rH   rM   rN   select_algorithmTritonTemplateCallerrX   r  r  r.  )r   r3  renderr^   r^   r_   swap_as_triton_caller[  s    

z)MultiTemplateBuffer.swap_as_triton_callerc                 C   sJ   t |tjjjst| jj|jjks(t| jj|jjks<t|	 | _
d S ra   )rH   rM   rN   r4  r5  rX   r  r   r   r.  r  )r   r3  r^   r^   r_   finalize_as_triton_callerg  s    z-MultiTemplateBuffer.finalize_as_triton_callerc                 C   s    t | j| jjd}|| j| fS )Nru  )rD  r0  r  )r   Z
min_choicer^   r^   r_   get_min_choicem  s    z"MultiTemplateBuffer.get_min_choice)r   r   r   r  rw  r	   r   r   r   r#  r*  r  r8  r0  r   r   r-  r7  r8  r   r9  r&  r^   r^   r  r_   r/  A  s   
r/  c                       s,   e Zd Zedd fddZdd Z  ZS )CUDATemplateBufferZCUDATemplate)workspace_sizetemplatec                    s    t  ||| || _|| _d S ra   )r  r  r;  r<  )r   r  r  r  r;  r<  r  r^   r_   r  s  s    zCUDATemplateBuffer.__init__c                 C   s   | j d k	r| j S dS r   )r;  r   r^   r^   r_   r    s    z%CUDATemplateBuffer.get_workspace_size)r   r   r   r   r  r  r&  r^   r^   r  r_   r:  r  s   r:  c                       s   e Zd Z fddZ  ZS )CppTemplateBufferc                    s    t  ||| || _|| _d S ra   )r  r  r<  choice)r   r  r  r  r<  r>  r  r^   r_   r    s    zCppTemplateBuffer.__init__r   r   r   r  r&  r^   r^   r  r_   r=    s   r=  c                   @   sJ   e Zd ZU ee ed< dd Zdd Zedd Z	e
dd	 Zd
d ZdS )r  r  c                 C   s   t | S ra   r  r6  r^   r^   r_   get_read_writes_input  s    z"InputsKernel.get_read_writes_inputc                    sp   g } j D ]8}t|tr2| fdd|D  q
| | q
tjt|t	 
 ht g d t dS )Nc                    s   g | ]}  |qS r^   )r@  r  r   r^   r_   rp     s     z0InputsKernel.get_read_writes.<locals>.<listcomp>)Z	op_counts)r  rH   rI   extendr  r@  r+   Z
ReadWritesr   r  r   collectionsCounter)r   Zstar_depinputr^   r   r_   r    s    

zInputsKernel.get_read_writesc                 C   sz   t |tr|j}t |tr |j}t |tr>t |ts>t|}t |trR| |S t |t	r`|S t |t
tfsvt||S ra   )rH   rR   r  r  r&  r#  rh  ri  unwrap_storage_for_inputTorchBindObjectr   rX   r  r   r^   r^   r_   rE    s    





z%InputsKernel.unwrap_storage_for_inputc                 C   s@   g }| D ]2}t |tr&dd |D }n
t|}|| q|S )Nc                 S   s   g | ]}t |qS r^   )r  rE  rk   r^   r^   r_   rp     s     z/InputsKernel.unwrap_storage.<locals>.<listcomp>)rH   rI   r  rE  r  )r  Z
inputs_newr   r^   r^   r_   r    s    

zInputsKernel.unwrap_storagec                 C   s   dS r  r^   r   r^   r^   r_   r    s    zInputsKernel.is_externN)r   r   r   r	   r   r   r@  r  r$  rE  r   r  r  r^   r^   r^   r_   r    s   



r  c                   @   s   e Zd Zdd ZdS )	NopKernelc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zNopKernel.is_no_opN)r   r   r   r  r^   r^   r^   r_   rH    s   rH  c                   @   s<   e Zd ZdZedd Zedd Zedd Zdd	 Zd
S )ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C   s  |d   }|d  }t|d  }dg}|| g}d|  krPt|k sVn ttdt|D ]}||  }	|||  t|	t|kst||  |kst||   |ksttt|D ]>}
|
|kr||
 |	|
  ||
< qtj	j
||
 |	|
 ||
< q|||  qdt|}tt|D ]L}|| }t|r0| }t|tr0t|j|jr0t|} q~q0tdd |D }tj	jjd }t|tst|dkrtdd |D rt|}td t||||dg d}t|}g }tt|D ]}| || tj|||| || dd	}|j| t|| j t!r`|| j " }n
|| j }|# rt$||   j%rt&|s||'  qt|dkrtj	(| tj	)||_*| +|j|_|S )
Nr   r)   c                 s   s   | ]}t |V  qd S ra   )r   r  r^   r^   r_   r     s     z&ConcatKernel.create.<locals>.<genexpr>Fc                 s   s@   | ]8}d |j ko6|j d  jtjdp6|j d  jtjdV  qdS )r  r  N)r  r  rM   r  r  rl   argr^   r^   r_   r     s   
)r   r   r   r   rg   r  r  )r  ),r   r   rI   r   rr   rX   rs   r  rD   r   r   rl  r   r   r   r   rH   r  rw  r  r   r   r   rB  r!  rd   rI  r  r  r  r  r  r  r&  r7  is_input_bufferr<   rY   r;   r   Zregister_listr  rg   r  )r  r  rO  r   r   r;  Zoffsets_startZoffsets_endrm   
input_sizerQ  output_strider   r  Zany_input_is_storage_and_layoutZfx_node_argsZconcat_kernelkernelZbuffer_namesZinput_bufferZinput_unwrappedr^   r^   r_   r    s    
 


 


    
zConcatKernel.createc                 C   s2   t |tr| |jS t |jjto0t |jt S ra   )rH   rR   can_realize_into_without_copyr  r  r   ExternKernelAlloc)r  r  r^   r^   r_   rR  *  s    
 z*ConcatKernel.can_realize_into_without_copyc              	   C   s   t |ts(t|r(t|\}}t||}t |ts:t|t |trR| |j|S t |tr|	  t
|jdstt| |rt||j_|jS tj| | | dd t| | D d}| ||S )Nr  c                 S   s    g | ]\}}t jj||qS r^   r  r  r^   r^   r_   rp   M  s   z-ConcatKernel.realize_into.<locals>.<listcomp>r  )rH   r#  r   r  rX   rR   r  r  r  r   r  rR  r  r  r,  r  r   r   r   rw   r   )r  r  r  r@  r  pwr^   r^   r_   r  4  s,    




	zConcatKernel.realize_intoc                 C   s   dS r  r^   r   r^   r^   r_   r  T  s    zConcatKernel.should_allocateN)	r   r   r   r  r$  r  rR  r  r  r^   r^   r^   r_   rI    s   
Y
	
rI  c                 C   sP   t | tjjr| jdkrd S | jdkr6| jdd n| jdd}d| dS )Natenr  .r   r   z
at::_ops::z::call)	rH   rM   _ops
OpOverload	namespaceZ_overloadnamer   r  replace)rQ  opnamer^   r^   r_   get_aten_cpp_kernel_nameX  s    r\  c                
       sb  e Zd ZU dZeedf ed< eje	dZ
eeef ed< dZee ed< dZee ed< dZee ed	< ejedZee ed
< dZeeejjejjf  ed< dZeeeeef   ed< dZeeeeeef f  ed< eje	dZeej e!j"f ed< dG fdd	Z#e$ej  dddZ%dd Z&dHddZ'dd Z(dd Z)dd Z*dd  Z+e,d!d" Z-e.eeee ee e/eegef eeej e!j"f  f dd#d$Z0e.d%d& Z1e.d'd( Z2e.d)d* Z3e.dId+d,Z4e.d-d. Z5e.d/d0 Z6e.d1d2 Z7d3d4 Z8d5d6 Z9d7d8 Z:d9d: Z;dJd;d<Z<d=d> Z=d?d@ Z>dAdB Z?e$ej  ddCdDZ@dEdF ZAeAZB  ZCS )Krh  r^   .constant_args)default_factoryre   Noutput_viewpython_kernel_namecpp_kernel_nameordered_kwargs_for_cpp_kernelop_overloadarg_propertieskwarg_propertiesunbacked_bindingsc                    sf   t  ||| || _|r|ni | _|| _|| _|p:t|
| _|	| _|
| _	| 
  i | _tjj| _d S ra   )r  r  r]  re   r_  r`  r\  ra  rb  rc  collect_arg_kwarg_propertiesrf  rD   r   r!  Zfx_node)r   rg   r  r  r]  re   r_  r`  ra  rb  rc  r  r^   r_   r  |  s    zExternKernel.__init__r   c                 C   s   t  S ra   r  r   r^   r^   r_   r    s    z%ExternKernel.get_unbacked_symbol_defsc                 C   s   t | jtjjr$dd | jjjD ndd tt| j	D | _
t | jtjjrbdd | jjjD ni | _t | jtjjr| jsdd | jjjD | _d S )Nc                 S   s$   g | ]}|j s|j|j|jd qS ))rg   rY   r  )
kwarg_onlyrg   	real_typer  r  r^   r^   r_   rp     s   z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S   s   g | ]}i qS r^   r^   rk   r^   r^   r_   rp     s     c                 S   s   i | ]}|j |j|jd qS ))rY   r  )rg   ri  r  r  r^   r^   r_   r     s    z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S   s   g | ]}|j r|jqS r^   rh  rg   r  r^   r^   r_   rp     s     )rH   rc  rM   rW  rX  _schema	argumentsrs   rr   r  rd  allarg_propertiesrb  r   r^   r^   r_   rg    s&    
z)ExternKernel.collect_arg_kwarg_propertiesFc                 C   s   t |ttfstt |tr$t|}| js2tdt|}t| j}||k rtd| j||  t	||D ]6}| j| d }|
||kr|| n| j| d  qj|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.rg   r  )rH   rI   rJ   rX   rd  rr   r  r  rc  rs   r  )r   rd   re   Zconvert_val_to_strZn_argsZ
n_pos_argsrm   arg_namer^   r^   r_   fill_non_provided_args  s(    	


z#ExternKernel.fill_non_provided_argsc                 C   s    t | jtr|   |   d S ra   )rH   r  r   apply_constraintr!  r   r^   r^   r_   r    s    zExternKernel.decide_layoutc                 C   s    t | |\}}|r|| d S ra   )r:   	writeline)r   wrapperZ
origin_strZdetailed_origin_strr^   r^   r_   codegen_comment  s    zExternKernel.codegen_commentc                 C   s   t d S ra   r(  r   rr  r^   r^   r_   codegen  s    zExternKernel.codegenc                 C   s*   t jjr$tjrt jj| jq(| jS | jS ra   )	rD   r   cpp_wrapperr*   abi_compatibler  Zget_c_shim_func_namera  r`  r   r^   r^   r_   get_kernel_name  s    zExternKernel.get_kernel_namec                 C   s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r   r   r  r   )	r,  r  r   r   r   r   r  r   r   )r   rT  r^   r^   r_   
copy_input  s    zExternKernel.copy_inputc                    s  ||d}t |\} g g }g }|D ]R}t|t d rP|| q(t|tjrptjj	j
j|d d}|| q( fdd}	fdd|D }|D ]}
t|
rt|
dd	 qg }|D ]h}
|
 tjjkr|tjj|
   q|
 tjjkr|tjj|
   q|t|
dd
 q|	||\}}|||}d }tjj
 }rzt|tj| t||tjjd}t|ttfs|gn|}|D ]N}t|tjr|jrd}tjjjdd  }r| d| }|tj_q||||	|fS )Nrc   r  )r  c                    sd   g }t | }t |}D ]&}|r0|t| q|t| qt| }|dg |di fS )Nrd   re   )iterr  nextpytreeZtree_unflattenr  )Znew_tensor_argsZnew_non_tensor_argsr  Z
it_tensorsZit_non_tensorsZ	is_tensorr  )	args_specis_arg_tensorr^   r_   unflatten_args   s    z3ExternKernel.process_kernel.<locals>.unflatten_argsc                    s   g | ]}  |qS r^   ri  r  r  r^   r_   rp   ,  s     z/ExternKernel.process_kernel.<locals>.<listcomp>Tr  r   r  zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 )r|  Ztree_flattenr  rH   r   rS   r   rD   r   r   ra  Zcreate_symintnoder   r  r   	constantsZtorchbind_constantsr   	fake_moder"   r!  r   r  r  rI   rJ   rM   TensorZ	is_sparseZdisable_cudagraphs_reason)r  rQ  rd   re   Zbinded_argsZ	args_flattensor_argsnon_tensor_argsrL  r  r   Zexample_argsnew_argsZ
new_kwargsexample_outputrf  ra  Zexample_out_lir   msgr  r^   )r}  r  r~  r_   process_kernel  sd    


  
zExternKernel.process_kernelc              	   C   sH  t |tstt |tr|S | }tj| 	 }|dk	rd|j
krt |jtr|j
d jtjds~|j
d jtjdr|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}tjj||}	t|||	 }
||
kr td||	| tt|j t!|" |# | ||	dd	S )
z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr  rJ  r  rF   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr  )r  r  )$rH   r&  rX   r#  r7  rD   r   r  r   r  r  r  r   r  rM   r  r  r  r   r   r!  r+   r  r   r   r  Zstride_varsZ
offset_varr>   r  r  r   r  r  r   r   )r  r   Zx_unwrap_viewZx_unwrap_view_fx_nodeZ
index_argsr  r  rn   r  r?  expectedr^   r^   r_   convert_to_reinterpret_viewb  sj    




 

z(ExternKernel.convert_to_reinterpret_viewc                 C   s  |d krt  S t|tjtjjjtfr.t|S t|t	rZt
jtj|j| | dS t|trh|S t|tr~| |jS t|trt| |j| S t|tr|  t| rz| |W S  tk
r   Y nX t|tr|  |S t|tr|S |  |S )N)r   r   )!r  rH   rS   r   rT   rU   rV   r   r  r  rD   r   Zadd_tensor_constantrM   tensorr   r   r   r7  rR   ri  r  r#  r   r&  r   r   r7  r  r   r  rF  ry  rG  r^   r^   r_   ri    s6    





zExternKernel.realize_inputc                 C   sD   t |r:t| dkr|S | D ]}|dkr$|  S q$| |S r>  )r   rr   r   ry  )r  r   r   r^   r^   r_   require_stride1  s    
zExternKernel.require_stride1c              	   C   s  |  dkr|S t|rt| tr2| j}qt| trxt|ddt||rjt	t
jj| jn||d |S t| tr| |r|S t| trt|  trtdn(t|  tr|  |r|S t|tr
| |r
|S t|trt|jtrt|jtst| rt| jtsz | |j|_| j|||dW S  tk
r   Y nX | |}t|dd||d t||st|S )Nr   TFr  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutr  )r   r   rH   r   r  r  r   r  r%  r   rD   r   r   Z
size_hintsr   r  r$  r  r  rX   r  rR   r  r&  r#  r7  rS  r  require_stride_orderr   ry  )r  r   ry   r  r^   r^   r_   r    s     
 



z!ExternKernel.require_stride_orderc                 C   s   |  |tS ra   )r  r  rG  r^   r^   r_   require_channels_last  s    z"ExternKernel.require_channels_lastc                 C   s   |  |tS ra   )r  r  rG  r^   r^   r_   require_channels_last_3d  s    z%ExternKernel.require_channels_last_3dc              	   C   s    |  |tttt| S ra   )r  rI   r  rs   rr   r   rG  r^   r^   r_   require_contiguous  s    zExternKernel.require_contiguousc                 C   s   d S ra   r^   r   r^   r^   r_   rp  #  s    zExternKernel.apply_constraintc                 C   s   t jjrpg }t| jD ]T\}}t| j| }| jrP|t| jk rP| j| dnd }|	t jj
|| q|S tt jj
j| jS d S )NrY   )rD   r   rv  r   r]  rr   r  rd  r  r  r  val_to_arg_strr   )r   r  rm   r   r   type_r^   r^   r_   codegen_const_args&  s    zExternKernel.codegen_const_argsc                 C   s   g }t | jD ]\}}t|trLdd |D }dd| d}|| qtjjr| j	rh|t
| j	k sptd| j	| d}|tjj|| q||  q||   |S )Nc                 S   s   g | ]}|  qS r^   r   rk   r^   r^   r_   rp   ;  s     z-ExternKernel.codegen_args.<locals>.<listcomp>[r]  ]z-Invalid access to ExternKernel.arg_propertiesrY   )r   r  rH   rI   r   r  rD   r   rv  rd  rr   rX   r  r  r  r   rA  r  )r   rd   rm   r   r  r   r  r^   r^   r_   codegen_args7  s,    

 zExternKernel.codegen_argsc                 C   sL   || j kr| j |S | jr:| j|r:| j|dS t| dd S )Nr  z not in self.allarg_properties)re   r  rm  rX   )r   rn  r^   r^   r_   get_kwargs_valueN  s
    
zExternKernel.get_kwargs_valuec                 C   s   t jjrg }| jD ]p}|r$|dkr$q| |}t|tjrF|| q| j	rh|| j	krh| j	
|
dnd }|t jj|| qndd | j D }|S )Nr   rY   c                 S   s(   g | ] \}}| d t jj| qS r   rD   r   r  r  rl   kr  r^   r^   r_   rp   m  s   z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)rD   r   rv  rb  r  rH   rS   r   r  rm  r  r  r  re   r  )r   skip_outre   rn  r  r  r^   r^   r_   codegen_kwargsV  s0    

 zExternKernel.codegen_kwargsc              	   C   sl   t jrhtjjsht|  dkr"d S tjj|  }tjj| 	 }|
d|   d| d| d d S )Nr   zassert_size_stride(r]  r  )r*   Zsize_assertsrD   r   rv  rA   r   r  Zcodegen_shape_tupler   rq  r   )r   rr  r   r   r^   r^   r_   codegen_size_assertss  s    z!ExternKernel.codegen_size_assertsc                 C   s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r   )r   _sizer  r^   r^   r_   get_group_stride~  s    zExternKernel.get_group_stridec                    s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                    s   g | ]}  |qS r^   )r   r  r  r^   r_   rp     s     z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S   s   g | ]}t d | qS )d)r?   rk   r^   r^   r_   rp     s     T)rv  rs  c                 S   s   i | ]\}}||qS r^   r^   r   r^   r^   r_   r     s      z-ExternKernel.canonicalize.<locals>.<dictcomp>c                    s   g | ]} | qS r^   r^   rk   r   r^   r_   rp     s     c                    s   g | ]} | qS r^   r^   rk   )r  r^   r_   rp     s     cc                    s   g | ]} |qS r^   r^   r  )add_varr^   r_   rp     s     )rD   r   r   r   r   rs   rr   r  r  r   r   r  r0   rK   rw   rB   rS   r  rJ   )r   r  r  Zindex_orderry   r0  rn   Z	new_sizesrv   r  r   replacementr^   )r  r  r   r   r_   canonicalize  s(      
 zExternKernel.canonicalizec                 C   s>   t  }| jD ]}|t|O }q| j D ]}|t|O }q(|S ra   )r   r]  maybe_free_unbacked_symbolsre   rL   )r   r  rL  r^   r^   r_   r     s    
z%ExternKernel.get_unbacked_symbol_usesc                    sP   t  dd }d|g}| fddt D 7 }|d j  |S )Nr`  zpython_kernel_name=c                    s$   g | ]}|j  d t |j  qS r   )rg   rb   )rl   fieldr   r^   r_   rp     s   z(ExternKernel.__str__.<locals>.<listcomp>r  )rb   dataclassesfieldsr  r  r   )r   Zkernel_namer   r^   r   r_   r    s    zExternKernel.__str__)r^   NNNNr^   N)F)F)F)Dr   r   r   r]  r   r   r   r  r  rK   re   r   ri   r_  r
   r#  r`  ra  rI   rb  r   rc  r   rM   rW  rX  HigherOrderOperatorrd  r	   re  rf  rS   r   r|  ZKeyPathr  r   r  rg  ro  r  rs  ru  rx  r   ry  r$  r   r  r  ri  r  r  r  r  r  rp  r  r  r  r  r  r  r  r   r  r#  r&  r^   r^   r  r_   rh  g  s   
 	       "
"
[
C

	G



	
rh  c                       s.   e Zd Zdd Zd	 fdd	Zdd Z  ZS )
ExternKernelOutc                 C   sJ   |  | |  | jdd}||  |  | jr>| j nd | d S )NT)r  )rs  r  r  Zgenerate_extern_kernel_outrx  r   r_  r   rr  rd   r^   r^   r_   ru    s    
zExternKernelOut.codegenr^   Nc
           
         s:   t  d || |||pi d ||||	
 tj| | _d S ra   r  r  r  rD   r   r  rg   )
r   r  r  r]  re   r_  r`  ra  rb  rc  r  r^   r_   r    s    zExternKernelOut.__init__c                 C   s   dS r  r^   r   r^   r^   r_   r    s    zExternKernelOut.should_allocate)r^   NNNNr^   N)r   r   r   ru  r  r  r&  r^   r^   r  r_   r    s          r  c                       s&   e Zd Zeejd fddZ  ZS )RandomSeeds)countr   c                    sP   t t j}t jt|t j|gdg |j|j|ggdtj	r>dndt
jjd d S )Nr  zaten.randint.low_outzat::_ops::randint_low_out::callzat::randint_out)r  r  r]  r`  ra  rc  )rM   r  r  r  r  r  rD  rC  r*   rw  rU  randintZlow_out)r   r  r   Zlimitsr  r^   r_   r    s    zRandomSeeds.__init__)r   r   r   r   rM   r   r  r&  r^   r^   r  r_   r    s   r  c                       s6   e Zd Zdd Zd fdd	Zdd Zd	d
 Z  ZS )rS  c                 C   sD   |  | |  |  }tjj| | t| jt	r@| 
| d S ra   )rs  r  r  rD   r   r  Zgenerate_extern_kernel_allocrH   r  rw  r  r  r^   r^   r_   ru     s
    
zExternKernelAlloc.codegenr^   Nc	           	         s:   t  d || |||pi d ||||
 tj| | _d S ra   r  )	r   r  r  r]  re   r`  ra  rb  rc  r  r^   r_   r    s    zExternKernelAlloc.__init__c                 C   s   dS r  r^   r   r^   r^   r_   r     s    z!ExternKernelAlloc.should_allocatec                 C   s   t d S ra   r(  r   r^   r^   r_   rp  #  s    z"ExternKernelAlloc.apply_constraint)r^   NNNr^   N)r   r   r   ru  r  r  rp  r&  r^   r^   r  r_   rS    s         rS  c                       sx   e Zd Zdd Zdd Zdd Zdd Zeej	 d	 fd
dZ
eej	 d	ddZdd Z fddZdd Z  ZS )UserDefinedTritonKernelc                 C   sF   ddl m} ddlm} || j}g }t||r>|j}|j}||fS )Nr   )	Autotuner)kernel_side_table)	Ztriton.runtime.autotunerr  *torch._higher_order_ops.triton_kernel_wrapr  Z
get_kernel
kernel_idxrH   configsrh   )r   r  r  rQ  r  r^   r^   r_   get_kernel_and_configs(  s    
z.UserDefinedTritonKernel.get_kernel_and_configsc           	         s   |   \ }| || j\}}|  }g }tjjr fddt|D }| jD ].}| 	|}|
t|drt| nt| qP fddt|D }| | ||| j|||| d S )Nc                    s   g | ]\}}| j kr|qS r^   Z
constexprs)rl   rm   rL  rQ  r^   r_   rp   B  s     
 z3UserDefinedTritonKernel.codegen.<locals>.<listcomp>r   c                    s   g | ]\}}| j kr|qS r^   r  )rl   rm   r   r  r^   r_   rp   I  s    
 )r  Z!define_user_defined_triton_kernelre   r  rD   r   rv  r   rb  r  r  r  r   rY   rs  Z#generate_user_defined_triton_kernelgrid)	r   rr  r  new_nameZtriton_metard   Z	arg_typesrn  r  r^   r  r_   ru  4  s6      



     zUserDefinedTritonKernel.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r  S  s    z'UserDefinedTritonKernel.should_allocatec                 C   s   dS r  r^   r   r^   r^   r_   has_side_effectsV  s    z(UserDefinedTritonKernel.has_side_effectsr   c                    s   t   t| jB S ra   )r  r   r!   r  r   r  r^   r_   r   [  s    z0UserDefinedTritonKernel.get_unbacked_symbol_usesc                 C   s   t  S ra   r  r   r^   r^   r_   r  `  s    z0UserDefinedTritonKernel.get_unbacked_symbol_defsc                 C   s   g S ra   r^   r   r^   r^   r_   r  c  s    z*UserDefinedTritonKernel.get_mutation_namesc                   s,  g }t  }g }  D ]H\}}t|trLt| |}	||	 |	||< q|| |||< qt|dkspt	|d 
 }
t d t|
|t|| tj| | _|| _|| _|  \}} fdd|jD | _ddlm} t|dkr|d jni } fdd|| |D | _t| f| j  d S )Nr   c                    s   g | ]}| kr|qS r^   r^   rK  kernel_argsr^   r_   rp     s     z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_mutated_tensorsc                    s   g | ]} | qS r^   r^   rl   rv  r  r^   r_   rp     s   )rK   r  rH   rR   r  rE  ri  r  rr   rX   r   r  r  r  rJ   rD   r   r  rg   r  r  r  	arg_namesrb  r  r  re   mutable_argsr"  )r   r  r  r  r  re   r]  r  r  r   r   rQ  r  r  Zautotuned_kwargsr  r  r_   r  i  sF    






 z UserDefinedTritonKernel.__init__c                 C   s   dd | j D S )Nc                 S   s   g | ]}|  qS r^   r  rk   r^   r^   r_   rp     s     zHUserDefinedTritonKernel.get_inputs_that_alias_output.<locals>.<listcomp>)r  r   r^   r^   r_   r    s    z4UserDefinedTritonKernel.get_inputs_that_alias_output)r   r   r   r  ru  r  r  r   rS   r   r   r  r  r  r  r&  r^   r^   r  r_   r  '  s   ,r  )mutated_nodesc                 G   sP   |D ]F}t |ts*t| dt| dtj|  t|	 ||  qdS )z
    Allows ops in mutated_nodes to be marked as being mutated as well as
    indicates to the scheduler that these ops depend on cur_buffer.

    NB: Use this instead of directly constructing MutationOutput
    z node is type z and is not an IRNodeN)
rH   r   rX   rY   rD   r   r  r   MutationOutputr   )Z
cur_bufferr  r[   r^   r^   r_   r"    s     r"  c                       sD   e Zd Zdd Z fddZdd Zdd Zd	d
 Zdd Z  Z	S )r  c                 C   s   | j d  gS r   r  r   r   r^   r^   r_   r    s    z!MutationOutput.get_mutation_namesc                    s.   t  d |||gd || _tj| | _d S r  )r  r  node_doing_mutatingrD   r   r  rg   )r   r  Zmutated_noder  r  r^   r_   r    s    zMutationOutput.__init__c                 C   s   dS r  r^   r   r^   r^   r_   r    s    zMutationOutput.should_allocatec                 C   s   dS r  r^   r   r^   r^   r_   r    s    zMutationOutput.is_no_opc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zMutationOutput.has_side_effectsc                 C   s   | j d  gS r   r  r   r^   r^   r_   r    s    z+MutationOutput.get_inputs_that_alias_output)
r   r   r   r  r  r  r  r  r  r&  r^   r^   r  r_   r    s   r  c                       sL   e Zd ZdZdd Zdd Zdd Zeej	 dd	d
Z
 fddZ  ZS )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    c                 C   s   dd | j D \}tjjrVtjrV||   d| ddt	t
| j d|j  n4||   d| ddt	t
| j d|j  d S )Nc                 s   s   | ]}|  V  qd S ra   r  rl   r   r^   r^   r_   r     s     z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>(r]  z, NULL)r  )r  rD   r   rv  r*   rw  rq  rx  r   r   reprr]  ending)r   rr  r   r^   r^   r_   ru    s    ,,z InplaceBernoulliFallback.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r    s    z(InplaceBernoulliFallback.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r    s    z+InplaceBernoulliFallback.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r    s    z1InplaceBernoulliFallback.get_unbacked_symbol_defsc                    sT   t  jd t| | |g||d tj| | _d| _	t
jsFd| _t| | d S )Nrc  zaten.bernoulli_zat::native::bernoulli_)r  r  r  r   r  rD   r   r  rg   r`  r*   rw  ra  r"  )r   rc  r   r]  r  r^   r_   r    s    

z!InplaceBernoulliFallback.__init__r   r   r   r  ru  r  r  r   rS   r   r  r  r&  r^   r^   r  r_   r    s   r  c                       s`   e Zd ZdZdd Zdd Zdd Zeej	 dd	d
Z
 fddZededddZ  ZS )InplaceCopyFallbackr  c                 C   s>   |   \}}}||   d| d| d| d|j 	 d S )Nr  r]  r  )r  rq  rx  r  )r   rr  r  r  non_blockingr^   r^   r_   ru    s    $zInplaceCopyFallback.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r    s    z#InplaceCopyFallback.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r    s    z&InplaceCopyFallback.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r    s    z,InplaceCopyFallback.get_unbacked_symbol_defsc                    s4   t  jd |||dtjrdndd tj| | _d S )Nz
aten.copy_Zaoti_torch_copy_zat::_ops::copy_::callr`  ra  )r  r  r*   rw  rD   r   r  rg   r   r  r  r]  r  r^   r_   r     s    
zInplaceCopyFallback.__init__F)r  c                    s>    fdd||fD }|f}t t| ||}t|| |S )Nc                    s   g | ]}  |qS r^   r  r  r  r^   r_   rp     s     z.InplaceCopyFallback.create.<locals>.<listcomp>)r  r  r   r"  )r  r  r  r  r  r]  r  r^   r  r_   r    s    

zInplaceCopyFallback.create)F)r   r   r   r  ru  r  r  r   rS   r   r  r  r$  r   r  r&  r^   r^   r  r_   r    s   r  c                   @   sD   e Zd ZdZdd Zdd Zdd Zeej	 dd	d
Z
dd ZdS )MutatingFirstArgExternKernelr  c                 C   sF   dd | j D tt| j}||   dd| d|j  d S )Nc                 s   s   | ]}|  V  qd S ra   r  r  r^   r^   r_   r   &  s     z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>r  r]  r  )r  r   r  r]  rq  rx  r   r  )r   rr  Zargrefsr^   r^   r_   ru  $  s    
z$MutatingFirstArgExternKernel.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r  -  s    z,MutatingFirstArgExternKernel.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r  0  s    z/MutatingFirstArgExternKernel.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r  3  s    z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C   s   dS r  r^   r   r^   r^   r_   r  6  s    z-MutatingFirstArgExternKernel.has_side_effectsN)r   r   r   r  ru  r  r  r   rS   r   r  r  r^   r^   r^   r_   r    s   	r  c                       s   e Zd Z fddZ  ZS )ResizeStorageBytesc                    s   t |tstdt jd t| | |g|fd tj	
|  tj	| | _d| _d| _tj	j|j  t| | d S )NzTODO: dynamic shapes)r]  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)rH   r   rX   r  r  r  r   r  rD   r   r  r   r  rg   r`  ra  never_reuse_buffersrA  r  r"  )r   variabler;  r  r^   r_   r  ;  s    

zResizeStorageBytes.__init__r?  r^   r^   r  r_   r  :  s   r  c                       s4   e Zd Z fddZdd Zdd Zdd Z  ZS )	SetSourceTensorKernelc                    sj   |   t j| ||gdd tjj|j	  tjj|	  tjj| 	  t
| || d S )Nz!torch.ops.aten.set_.source_Tensor)r`  )r!  r  r  r   rD   r   r  rA  r  r   r"  )r   Zself_tensorZstorage_tensorr  r^   r_   r  L  s    zSetSourceTensorKernel.__init__c                 C   s   | j d  | j d  gS r>  r  r   r^   r^   r_   r  X  s    z2SetSourceTensorKernel.get_inputs_that_alias_outputc                 C   s   | j d  gS r  r  r   r^   r^   r_   r  [  s    z(SetSourceTensorKernel.get_mutation_namesc                 C   s   dS r  r^   r   r^   r^   r_   r  ^  s    z&SetSourceTensorKernel.has_side_effects)r   r   r   r  r  r  r  r&  r^   r^   r  r_   r  K  s   r  c                       sb   e Zd ZdZdd Zdd Zdd Zeej	 dd	d
Z
dddeee ed fddZ  ZS )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    c              
   C   s   | j d }tjjr,ddd}||kr,|| }| jrJdd | jD \}}}ndd | jD \}}| jd }|||| jd	 ||g| j| j	| j|| 
  d S )
Nr  rF  rE  )rA  multiplyc                 s   s   | ]}|  V  qd S ra   r  r  r^   r^   r_   r   r  s     z*ScatterFallback.codegen.<locals>.<genexpr>c                 s   s   | ]}|  V  qd S ra   r  r  r^   r^   r_   r   t  s     r)   r   )re   rD   r   rv  src_is_tensorr  r]  Zgenerate_scatter_fallbackra  r`  r  )r   rr  r  Zget_operator_enumr   rn   r  r^   r^   r_   ru  i  s$    


zScatterFallback.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zScatterFallback.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r    s    z"ScatterFallback.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r    s    z(ScatterFallback.get_unbacked_symbol_defsNTr  include_self)rO  r  r  c          
   
      s   t |t _ jr2 fdd|||fD }|f}	n fdd||fD }||f}	t jd t|  ||	||dt|ddg|d t	| _
tj  _t | d S )Nc                    s   g | ]}  |qS r^   r  r  r   r^   r_   rp     s     z,ScatterFallback.__init__.<locals>.<listcomp>c                    s   g | ]}  |qS r^   r  r  r   r^   r_   rp     s     r  r  r  )r`  rb  rc  )rH   rR   r  r  r  r  r   r  ri   r\  ra  rD   r   r  rg   r"  )
r   rc  r   rO  rn   r  r  r  tensorsr]  r  r   r_   r    s&    


zScatterFallback.__init__)r   r   r   r  ru  r  r  r   rS   r   r  r   r
   ri   r   r  r&  r^   r^   r  r_   r  b  s   r  c                       sL   e Zd ZdZdd Zdd Zdd Zeej	 dd	d
Z
 fddZ  ZS )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    c           	      C   s   dd | j D ^}}}g }t|}t| jD ]6\}}| j| d k	rR|t| q,|tjjj	 q,|j
|  |||f|    d S )Nc                 s   s   | ]}|  V  qd S ra   r  r  r^   r^   r_   r     s     z+IndexPutFallback.codegen.<locals>.<genexpr>)r  rz  r   r  r  r{  rD   r   r  r  Zgenerate_index_put_fallbackrx  r  )	r   rr  r   rL   valid_indicesr  Ziter_valid_indicesrm   r   r^   r^   r_   ru    s       zIndexPutFallback.codegenc                 C   s   dS r  r^   r   r^   r^   r_   r    s    z IndexPutFallback.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r    s    z#IndexPutFallback.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r    s    z)IndexPutFallback.get_unbacked_symbol_defsc           	   	      s   | _ dd |D } fdd||f|D }tjr8dnd}t jd t|  ||fd||d tj	
  _t | d S )Nc                 S   s   g | ]}|d k	r|qS ra   r^   rk   r^   r^   r_   rp     s      z-IndexPutFallback.__init__.<locals>.<listcomp>c                    s   g | ]}  |qS r^   r  r  r   r^   r_   rp     s     Zaoti_torch_index_put_outzat::index_put_outzaten.index_put_)r`  ra  rc  )r  r*   rw  r  r  r  r   r  rD   r   r  rg   r"  )	r   rc  r   r  rL   
accumulater  r  ra  r  r   r_   r    s     
	zIndexPutFallback.__init__r  r^   r^   r  r_   r    s   r  c                   @   s    e Zd Zedd Zdd ZdS )
DeviceCopyc                 C   sx   |  s0tdd | D r0tjjs0||S tj	| tj	|
  td tt|| | d| |gS )Nc                 s   s(   | ] }|j tjjkot|tjV  qd S ra   )rg   rD   r   r  rH   r+   r  r  r^   r^   r_   r     s   z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programr  )r  r  r   r*   Zaot_inductorZuse_runtime_constant_foldingr"  rD   r   Zadd_device_infor   r9   r  r   r   r   ri  )r  r   r   r^   r^   r_   r    s(    

zDeviceCopy.createc                 C   sN   |   }t|dkst| jr6||d | j  n||d |   d S rE  )r  rr   rX   r_  Zcodegen_device_copyr   r  r^   r^   r_   ru    s
    zDeviceCopy.codegenN)r   r   r   r$  r  ru  r^   r^   r^   r_   r    s   
r  c                       sL   e Zd ZdZdd Zdd Z fddZeej	 dd	d
Z
dd Z  ZS )rP   z;
    The result of a call to aten._local_scalar_dense.
    c                 C   s   dS r  r^   r   r^   r^   r_   r     s    zDynamicScalar.get_readsc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zDynamicScalar.should_allocatec                    s:   |   t d ttd| |g || _|| _d S r   )	r   r  r  r  rM   r   r  symkeypath)r   r  r  r  r  r^   r_   r    s    "zDynamicScalar.__init__r   c                 C   s   | j hS ra   )r  r   r^   r^   r_   r    s    z&DynamicScalar.get_unbacked_symbol_defsc                 C   s   | |  d S ra   )Zcodegen_dynamic_scalarrt  r^   r^   r_   ru    s    zDynamicScalar.codegen)r   r   r   r  r   r  r  r   rS   r   r  ru  r&  r^   r^   r  r_   rP      s   rP   c                       sH   e Zd ZdZdd Zdd Z fddZdd	 Zd
d Zdd Z	  Z
S )rQ   z5
    The result of a call to aten._assert_scalar
    c                 C   s   dS r  r^   r   r^   r^   r_   r     s    zAssertScalar.get_readsc                 C   s   dS r  r^   r   r^   r^   r_   r     s    zAssertScalar.should_allocatec                    s*   t  d ttdg  || _|| _d S r   )r  r  r  rM   r   scalarr  )r   r  r  r  r^   r_   r  #  s    zAssertScalar.__init__c                 C   s   dS r  r^   r   r^   r^   r_   r  .  s    zAssertScalar.has_side_effectsc                 C   s
   t | jS ra   )r!   r  r   r^   r^   r_   r   1  s    z%AssertScalar.get_unbacked_symbol_usesc                 C   s\   t jjr
nN|dt jjj| jdd d |dt| j d || 	  d d S )Nzif not F)r  :z    raise RuntimeError(r  z = None)
rD   r   rv  rq  r  Zcodegen_python_sizevarr  r  r  r   rt  r^   r^   r_   ru  4  s    	zAssertScalar.codegen)r   r   r   r  r   r  r  r  r   ru  r&  r^   r^   r  r_   rQ     s   rQ   c                   @   s    e Zd ZU eed< ejed< dS )ExternKernelNoderg   r[   N)r   r   r   ri   r   export_schemar   r^   r^   r^   r_   r  H  s   
r  c                       s   e Zd Zd!dd fddZdd Zeej ddd	Zd
d Z	dd Z
edd Zdd Zdd Zdd Zdd Zdd ZeejdddZedd Z fdd Z  ZS )"FallbackKernelNrf  c                   s  |t jjkr,t|dkr,t|dkr,t jj}t j|t|t||d g  _d _	| _
t|tjjtjjfstd| dt| d| _| _|d kri n| _tj j g  _g  _t jtjjrd S d j krd S  jj}tjj jr j |d !  d S |j"r:t#|s:t$d	| |j%}	  j& j'\}
} fd
d}tjj(||
|D ]\}}||| qrd S )Nr)   r  Fz#Fails to create FallbackKernel for r^  z not supportedZ_c10d_functionalr   z'NYI: Can't generate FallbackKernel for c                    s   t | jtjr t |ttfs tt | jtjo>t | j tj	}|sRt | jtj	rdt |ttfrdt|d krpd S | j
d kr~d S t | jtj	s|st j|  | j
jrt | d S ra   )rH   rY   rM   ListTyperI   rJ   rX   ZOptionalTypegetElementType
TensorType
alias_infoalias_namesr  r   is_writer"  )inforL  Zis_optional_tensorr   r^   r_   handle_aliasing_and_mutation  s$     
z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutation))rU  r@  r  rr   ZScalarr  r  rJ   outputsuse_runtime_dispatchrf  rH   rM   rW  rX  r  rX   rY   rc  r  re   rD   r   Zwarn_fallbackr`  r  mutation_namesrg   rk  Z_libraryutilsZmutates_and_returns_first_argr  r   
is_mutabler   r   rl  r  r]  Z
zip_schema)r   r  rQ  r  nontensor_argsr  re   rf  schemaZschema_argsrd   r  r  rL  r  r   r_   r  `  s^    



zFallbackKernel.__init__c                    s|   t dsd S ttjjjj}|s(d S | D ]F\} fdd  fdd}||	| d|  |j
  q0d S )Nrf  c                    s  |dkr| S t |dkrft|d trft|d tjrf |  d|d j d|d j d|dd  S t|d tr |  d|d j d|dd  S t|d tjr̈ |  d	|d j d
|dd  S t|d tr  |  d|d j d|dd  S t	d| d S )Nr^   r   r   r)   rV  r  r  z()r  r  z.__floordiv__(zunrecognized keypath )
rr   rH   r   r|  SequenceKeyrg   r   r    ru  rX   )r  r  )gor^   r_   r    s&    
  
$$$z7FallbackKernel.codegen_unbacked_symbol_defs.<locals>.goc                      sz   t jjrhtjrhtjdkr0 jd  S td t	j
sDt jd j  dd  S n  S d S rE  )rD   r   rv  r*   rw  rr   r  r   rH   r|  r  rX   r   r^   r  r  r   r^   r_   go_outer  s    $z=FallbackKernel.codegen_unbacked_symbol_defs.<locals>.go_outer = )r  r#   rD   r   r   ra  rf  r  rq  Zcodegen_unbacked_symbol_declr  )r   rr  rf  r   r   r^   r  r_   codegen_unbacked_symbol_defs  s    
 z+FallbackKernel.codegen_unbacked_symbol_defsr   c                 C   s.   t | dd  }r$ttjjj| S t S d S )Nrf  )rb   r#   rD   r   r   ra  r  r   )r   rf  r^   r^   r_   r    s     z'FallbackKernel.get_unbacked_symbol_defsc                    s   ddl m} |jjr&td|j ddd  t fdd|jjD sXt|j d	t fd
d|jjD st|j d|jj	| _
|jj| _| j
dd d| j | _||| _d S )Nr)   get_cpp_op_schemazmutable z" is not supported with cpp_wrapperc                 S   s   | j d kp| j j S ra   )r  r  )rL  r^   r^   r_   is_not_write  s    z3FallbackKernel.set_cpp_kernel.<locals>.is_not_writec                 3   s   | ]} |V  qd S ra   r^   r  r  r^   r_   r      s    z0FallbackKernel.set_cpp_kernel.<locals>.<genexpr>z< with alias_info arguments is not supported with cpp_wrapperc                 3   s   | ]} |V  qd S ra   r^   r  r  r^   r_   r   #  s    z: with alias_info returns is not supported with cpp_wrapper::r   )codegen.wrapperr  rk  r  rX   r   r  rl  returnsrg   ra  overload_namecpp_kernel_overload_namerZ  cpp_kernel_keycpp_op_schemar   rQ  r  r^   r  r_   set_cpp_kernel  s(    





zFallbackKernel.set_cpp_kernelc                    s   t jG dd d  fdd| jD }| || j\}}tjjrzt| j	t
jjrz| ||}dd t| j	jj|D }ndd |D }| j| |S )Nc                   @   s   e Zd ZU eed< dd ZdS )z)FallbackKernel.codegen_args.<locals>.Shimrefc                 S   s   | j S ra   )r  r   r^   r^   r_   r#  2  s    z2FallbackKernel.codegen_args.<locals>.Shim.__repr__N)r   r   r   r   r   r#  r^   r^   r^   r_   Shim.  s   
r  c                    s   g | ]} |  qS r^   r  r  r  r^   r_   rp   5  s     z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S   s"   g | ]\}}t jj||jqS r^   )rD   r   r  r  ri  )rl   paramr   r^   r^   r_   rp   9  s   c                 S   s   g | ]}t jj|qS r^   r  r  r^   r^   r_   rp   >  s     )r  	dataclassr  r  r]  rD   r   rv  rH   rc  rM   rW  rX  ro  rw   rk  rl  re   update)r   r  rd   re   r^   r  r_   r  -  s    zFallbackKernel.codegen_argsc                 C   s   | rdd | D }|d S t |tjr,|jS t |ttfrdd |D }dd |D }t|dkrj|d S |D ]}t|jrn|  S qn|d S d S )Nc                 S   s   g | ]}|  r|  qS r^   )r   rK  r^   r^   r_   rp   G  s      z.FallbackKernel.find_device.<locals>.<listcomp>r   c                 S   s   h | ]}t d |qS ra   )r  find_devicer  r^   r^   r_   r   L  s     z-FallbackKernel.find_device.<locals>.<setcomp>c                 S   s   g | ]}|r|qS r^   r^   )rl   r   r^   r^   r_   rp   N  s      r)   )	rH   rM   r  r   rI   rJ   rr   r<   rY   )r  r  ZdevicesZ
device_setr   r^   r^   r_   r  D  s    

zFallbackKernel.find_devicec                 C   s"   t | jtjjrdS t| j S r  )rH   rc  rM   rW  r  r   r  r   r^   r^   r_   r  W  s    zFallbackKernel.has_side_effectsc                 C   s   | j S ra   )r  r   r^   r^   r_   r  \  s    z+FallbackKernel.get_inputs_that_alias_outputc                 C   s   t | jdkst| jS r  )rr   r  rX   r   r^   r^   r_   r  _  s    z!FallbackKernel.get_mutation_namesc           
         s"  t | tst| | j| j\}| |}fdd| jD }tj	j
sR||S td d }|| j|}dd  | j}|jj}t|dkr|d j} || jg}n@t | jtstt|t| jkst fddt|| jD }t|  tj| j ||i dd	}	tj	j|	 ||S )
Nc                    s   g | ]}  |d qS ra   )r  r  )re   r^   r_   rp   m  s    z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>c                 S   s   t | tjrP|}t |ttfr6t|dks.t|d }tjj	tj
| ddS t | tjrt |  tjrtjj	dd |D dS tdt|  d S )	Nr)   r   rf   )Z	as_tensorc                 S   s   g | ]}t j| d qS )rf   )r  TensorArgumentr   )rl   r   r^   r^   r_   rp     s   zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)Z
as_tensorszUnsupported return type )rH   rM   r  rI   rJ   rr   rX   r  ZArgumentr  r  r   r  r  RuntimeErrorrY   )return_typeoutputr   r^   r^   r_   handle_single_outputx  s$     zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr)   r   c                    s   g | ]\}} |j |qS r^   )ri  )rl   Zreturn_schemar  )r  r^   r_   rp     s   )rH  r  r  metadata)rg   r[   )rH   r  rX   r  r  r]  ro  rb  rD   r   Zaot_moder   Zserialize_inputsrc  rk  r	  rr   ri  r  rJ   rw   r  r   r  r   rg   Zextern_kernel_nodesr  )
r   rd   Zordered_kwargs
serializerZnamed_argumentsrH  r	  r  Zoutput_argumentsr[   r^   )r  re   r_   export_extern_kernel_nodei  s@    





z(FallbackKernel.export_extern_kernel_nodec                 C   s  | j }|jdkrnt|tjjs"ttjj	rbt
 rl|tkrlt
jdkrltd| d| _| | n
t|| _n|jdkrt|tjjsttjj	r| | t
jsd| _n
t|| _nTt|tjjrd|j | _n6|jdd d	|j | _tjj	r
d| _| | | jrv| | d }d }t
jr6|  }n|  |  }||  | j| j|| j | j!| j"| j || j#
 nB| | |  |  }tjj$%| | t| j&t'r| (| | )| d S )
NrU  1zG%s is missing a c-shim implementation, using proxy executor as fallbackTZ
_quantizedztorch.ops.higher_order.z._ops.z.ops.rV  )*rc  rY  rH   rM   rW  rX  rX   rD   r   rv  r*   Z	is_fbcode
has_c_shimZc_shim_versionr  r  r  r  ri   r`  rw  r  r   r   rZ  rs  r  r  r  6generate_extern_kernel_alloc_and_find_schema_if_neededr   ra  r  r  r  r  r  Zgenerate_fallback_kernelr  rw  r  r  )r   rr  rQ  Zexported_argsrd   r^   r^   r_   ru    sn    








zFallbackKernel.codegenr  c                 C   s"   t | j| jt|  t|  S ra   )r  r   r   r7   r   r   r"  r^   r^   r_   tensor_to_layout  s    

zFallbackKernel.tensor_to_layoutc              	      s   t jf}||krtjjnt }|"  j|f||\}}}}	}
W 5 Q R X  ||}|d krx t|||||	|
dn$|st	d t
|||||	|
d fdd|g }t|tttfr|_n|g_|S )Nr  z"Not sure where to find device infoc                    s   t ttfr4t fddttD S t trX fdd D S t tj	rvt
 S t trS t tjrjjS d kstdt dd S d S )Nc                 3   s*   | ]"} | t |fg V  qd S ra   rY   rk   generate_outputr  r  r^   r_   r   !  s   zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                    s*   i | ]"\}}| |t |fg qS r^   r$  )rl   rv  r  r%  r^   r_   r   &  s    zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>zFallbackKernel output type z is not supported)rH   rI   rJ   rY   rs   rr   rK   r  rM   r  MultiOutputr#  r   ZSymIntr[   r  rX   )r  r  r  r&  packed)r  r  r_   r&    s.    


z.FallbackKernel.create.<locals>.generate_output)rU  Z*_fused_moving_avg_obs_fq_helper_functionalrD   r   r  r   r  r  r  rX   r  rH   rI   rJ   rK   r  )r  rQ  rd   re   Zfake_incorrect_kernelscontextr  r  r  r  rf  r   r  r^   r(  r_   r    sJ    

	
zFallbackKernel.createc                    s
   t   S ra   )r  rp  r   r  r^   r_   rp  A  s    zFallbackKernel.apply_constraint)N)r   r   r   r  r  r   rS   r   r  r  r  r   r  r  r  r  r  ru  rM   r  r#  r$  r  rp  r&  r^   r^   r  r_   r  _  s(    	u6

BF
Fr  c                       s6   e Zd ZdZdd Zdd Zdd fdd	
Z  ZS )
ComplexViewz9View a complex number as two dtyped numbers or vice versac                 C   s   dS r  r^   r   r^   r^   r_   r  I  s    zComplexView.should_allocatec                 C   s   | j d  gS r   r  r   r^   r^   r_   r  L  s    z(ComplexView.get_inputs_that_alias_outputNr  c                   s   t  j||||||d d S )Nr  r  r  )r   r  rQ  r  r  r  rf  r  r^   r_   r  P  s    
zComplexView.__init__)r   r   r   r  r  r  r  r&  r^   r^   r  r_   r+  E  s
   r+  c                   @   s   e Zd ZU ejed< dS )r  r   N)r   r   r   rM   r   r   r^   r^   r^   r_   r  d  s   
r  c                       sb   e Zd Zdd Zdd Zeeedf  d fddZe	e
j d	d
dZdd Zdd Z  ZS )r'  c                 C   s   t |dkr|d \}}t|trB| | d| d|dd  S t|trztjj|| 	 t
|}| ||dd  S t|tr| | d| d|dd  S td|n|S d S )Nr   r  r  r)   z['z']znon supported index type: )rr   
issubclassrI   codegen_list_tuple_accessrJ   rD   r   r  Zcodegen_tuple_accessr   ri   rK   rX   )r   basenamer  Zityperm   Ztuple_accessr^   r^   r_   r.  m  s    
 
  
 z%MultiOutput.codegen_list_tuple_accessc                 C   s(   | |  | | jd  | j d S r   )Zcodegen_multi_outputr   r.  r  r  rt  r^   r^   r_   ru    s    zMultiOutput.codegen.r  c                    s,   t  d ||gd tj| | _|| _d S r  )r  r  rD   r   r  rg   r  )r   r  rD  r  r  r^   r_   r    s    zMultiOutput.__init__r   c                 C   s   | j d  S r   )r  r   r   r^   r^   r_   r     s    z$MultiOutput.get_unbacked_symbol_usesc                 C   s   dS r  r^   r   r^   r^   r_   r    s    zMultiOutput.should_allocatec                 C   s   dd | j D S )Nc                 S   s.   g | ]&}t |trt| d kr| qS r   )rH   r  rr   r  r   )rl   inpr^   r^   r_   rp     s   
z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>r  r   r^   r^   r_   r    s    z(MultiOutput.get_inputs_that_alias_output)r   r   r   r.  ru  r	   r   r   r  r   rS   r   r   r  r  r&  r^   r^   r  r_   r'  i  s   r'  rR   )	r   r  biaspaddingr   dilationgroups
transposedoutput_paddingc
                 C   s  dd }
dd }|   |   |dk	r0|   tjj t|dd}t|dd}t| d }d	t|  k r||ksn td	t|  k r|ksn td	t|  k r|ksn tt||}t||}t||}|	dkrtd	g|}	n,d	t|	  k r|ksn tt|	|}	t	|t
s.t|r\|||}| }|
||||	|||}n@|dk	rrt|ddn|}tjj||||||||	|	}| }d	gtttd
t|d
  }t|g| }W 5 Q R X | ||}tdd |D  }|rt|rt|}nt|}| jdkr8| jdks<t||g}t| | t|t|}||||g}|r|d
|	 |dk	r|| n|d	| ||||fS )au  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU device since conv post-op fusion kernel is only
    supported on CPU right now.
    c                 S   s   t | t |kstdt | }|dks0tdd}d}	g }
|
| |  |
||	 |  td|D ]b}|| d ||d   d }| | d ||d   ||d  d  | ||d   }|
| qfttt|
S )NzExpect input dim == weight dimr   zExpect input dim > 2r   r)   )rr   rX   r  rs   rI   r   r   )output_sizeweight_sizer3  r7  r   r4  r5  rO  Z	BATCH_DIMZWEIGHT_INPUT_CHANNELS_DIMrO  r  rQ  Zinput_size_dr^   r^   r_   _conv_input_size  s(    
z<_prepare_convolution_fusion_create.<locals>._conv_input_sizec                 S   s   |   }t|}|dks td|dkrpg }||d |  ||d |  td|D ]}|||  qZn| dd  }|S )Nr   zExpect weight dim > 2r)   r   )r   rr   rX   r  rs   Z	transpose)Zprepacked_weightr5  Zprepacked_weight_sizerO  r9  r  r^   r^   r_   _original_deconv_weight_size  s    zH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeNTr  r   r   r)   c                 s   s   | ]}t |tV  qd S ra   )rH   r   rk   r^   r^   r_   r     s     z5_prepare_convolution_fusion_create.<locals>.<genexpr>r   )r   rD   r   r  r   rr   r   rX   r=   rH   r   rM   rC   rU  ZconvolutionrI   r  rs   r  r  r  r   r   r   r   rY   r  r   r7   insertr  )r  r   r  r2  r3  r   r4  r5  r6  r7  r:  r;  Zx_fakeZweight_fakerJ  r9  rO  r8  Z	bias_faker  req_stride_orderZdynamic_shapesrP  r  kernel_layoutr]  r^   r^   r_   "_prepare_convolution_fusion_create  s    


"

 	$
r?  )r   r  r2  c                 C   s   |   |   |dk	r |   | ^ }}| \}}t||g }tttt| }| ||}| jdkr| jdkst	||g}	t
|}
t| | ||
}g }|dk	r|	| n|d| |	|||fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    Nr   r   )r   r   rI   r  rs   rr   r  r   rY   rX   r   r   r  r   r  r<  )r  r   r  r2  mr   ocr8  r=  r  rP  r>  r]  r^   r^   r_   _prepare_linear_fusion_create5  s.     
rB  c                       sZ   e Zd Zd
 fdd	Zdd Zedddee ee ee eeee	  ddd	Z
  ZS )ConvolutionUnaryr^   c                    s(   t  j|||d ddd d| _d| _d S )Nz'torch.ops.mkldnn._convolution_pointwisemkldnn::_convolution_pointwiser  Zconvolution_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)r  r  r  r  r  r  r^   r_   r  b  s    zConvolutionUnary.__init__c                 C   s>   | |  | j| j|  | j| j t| jt	r:| 
| d S ra   )r!  r   r`  ra  r  r  r  rH   r  rw  r  rt  r^   r^   r_   ru  ~  s    zConvolutionUnary.codegenrR   )r   r  r2  padding_stride_	dilation_r5  scalarsc              	   C   s>   t | |||||||\}}}}||t|	|
g }t|||dS )Nr  r  r]  )r?  r   rC  )r  r   r  r2  rF  rG  rH  r5  attrrI  	algorithmr  r]  r>  r   r^   r^   r_   r    s(           zConvolutionUnary.create)r^   r   r   r   r  ru  r$  r	   r   r
   r   r  r&  r^   r^   r  r_   rC  a  s    
rC  c                       sp   e Zd Zd
 fdd	Zdd Zeddddee ee ee eee	e
 e	e e	ee  e	e ddd	Z  ZS )ConvolutionBinaryr^   c                    s4   t  j|||d ddd d| _d| _d| _|| _d S )Nz.torch.ops.mkldnn._convolution_pointwise.binaryrD  r  binaryZconvolution_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm))r  r  r  r  r  cpp_constant_args)r   r  r  r]  rP  r  r^   r_   r    s    zConvolutionBinary.__init__c              	   C   sB   | |  | j| j|  | j| j| j t| j	t
r>| | d S ra   )r!  r   r`  ra  r  r  r  r  rH   r  rw  r  rt  r^   r^   r_   ru    s    	zConvolutionBinary.codegenrR   r   r  r  r2  rF  rG  rH  r5  binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc              	   C   sZ   t | |||||||\}}}}| ||}|d| ||	|
|t||g }t|||dS )Nr)   rJ  )r?  r  r<  r   rN  )r  r   r  r  r2  rF  rG  rH  r5  rR  rS  rT  rU  rV  r  r]  r>  r=  r^   r^   r_   r    s:           zConvolutionBinary.create)r^   r^   )r   r   r   r  ru  r$  r	   r   ri   r
   r*  r   r  r&  r^   r^   r  r_   rN    s&     "
rN  c                       s   e Zd Zd fdd	Zdd Zdd Zeej dd	d
Z	e
ddddee ee ee eeee ee eee  ee dddZ  ZS )ConvolutionBinaryInplacer^   c                    sJ   |d |d g|dd   }t  j|||d ddd d| _d| _d	| _d S )
Nr)   r   r   z/torch.ops.mkldnn._convolution_pointwise_.binaryzmkldnn::_convolution_pointwise_r  rO  Zconvolution_pointwise_binary_a  
            at::Tensor&(
                at::Tensor& other_t,
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> unary_attr,
                torch::List<c10::optional<at::Scalar>> unary_scalars,
                c10::optional<c10::string_view> unary_algorithm)r  r  r  r  r  )r   r>  r  r]  Zreordered_inputsr  r^   r_   r    s    z!ConvolutionBinaryInplace.__init__c              	   C   s,   | |  | j| j|  | j| j| j d S ra   r!  r   r`  ra  r  r  r  r  rt  r^   r^   r_   ru  %  s    z ConvolutionBinaryInplace.codegenc                 C   s   | j d  gS r   r  r   r^   r^   r_   r  0  s    z+ConvolutionBinaryInplace.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r  3  s    z1ConvolutionBinaryInplace.get_unbacked_symbol_defsrR   rQ  c              	   C   s~   t | |||||||\}}}}| ||}|d| ||	|
|t||g }tt|d  ||d}t||d  |jd S )Nr)   )r>  r  r]  r   )	r?  r  r<  r   rW  r  r   r"  r  )r  r   r  r  r2  rF  rG  rH  r5  rR  rS  rT  rU  rV  r  r]  r   r=  r)  r^   r^   r_   r  6  s>           zConvolutionBinaryInplace.create)r^   )r   r   r   r  ru  r  r   rS   r   r  r$  r	   r   ri   r
   r*  r   r  r&  r^   r^   r  r_   rW     s(    $
rW  c                       s2   e Zd Zd fdd	Zdd Zedd Z  ZS )	MKLPackedLinearr^   c                    s(   t  j|||d ddd d| _d| _d S )Nztorch.ops.mkl._mkl_linearzmkl::_mkl_linearr  Z
mkl_lineara  
            at::Tensor(
                const at::Tensor& self,
                const at::Tensor& mkl_weight_t,
                const at::Tensor& origin_weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                const int64_t prepack_batch_size)rE  r  r  r^   r_   r  e  s    zMKLPackedLinear.__init__c                 C   s(   | |  | j| j|  | j| j d S ra   r!  r   r`  ra  r  r  r  rt  r^   r^   r_   ru  |  s    zMKLPackedLinear.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}}t||g }	t|	}
|||g}|g}|d k	rv||g7 }n|dd  tt|	 |
 |	|
||dS )Nr   rJ  )r  ri  r   rI   r   r   r<  rZ  r  r   r   )r  r   Zpacked_wZorig_wBZ
batch_sizer@  r   rA  r8  rP  r  r]  r^   r^   r_   r    s*    

   zMKLPackedLinear.create)r^   )r   r   r   r  ru  r$  r  r&  r^   r^   r  r_   rZ  d  s
    
rZ  c                       s:   e Zd Zd
 fdd	Zdd Zedd Zdd	 Z  ZS )LinearUnaryr^   c                    s(   t  j|||d ddd d| _d| _d S )Nz"torch.ops.mkldnn._linear_pointwisemkldnn::_linear_pointwiser  Zlinear_pointwiseaL  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rE  r  r  r^   r_   r    s    zLinearUnary.__init__c                 C   s(   | |  | j| j|  | j| j d S ra   r[  rt  r^   r^   r_   ru    s    zLinearUnary.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}	}||g}
||rL|ndg|g}|d k	rz|  | |}|
| n|dd  tt| | t	||	g d|
|dS )Nr  r   r  rJ  )
r  ri  r   r  r<  r]  r   r   r   rI   )r  r   wrR  rK  rI  rL  r@  icrA  r  r]  r^   r^   r_   r    s&    zLinearUnary.createc                 C   s   d S ra   r^   r   r^   r^   r_   rp    s    zLinearUnary.apply_constraint)r^   )	r   r   r   r  ru  r$  r  rp  r&  r^   r^   r  r_   r]    s    

r]  c                       s>   e Zd ZdZd fdd	Zdd Zedd Zd	d
 Z  Z	S )LinearBinary)torch.ops.mkldnn._linear_pointwise.binaryr^   c                    s.   t  j|||d ddd d| _d| _d| _d S )Nrb  r^  r  rO  Zlinear_pointwise_binarya  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& other_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                c10::string_view attr)
        rX  r  r  r^   r_   r    s    zLinearBinary.__init__c              	   C   s,   | |  | j| j|  | j| j| j d S ra   rY  rt  r^   r^   r_   ru    s    zLinearBinary.codegenc                 C   s   |  | |}|  | |}|  | |}| ^ }}| \}}|||g}	|g}
|d k	r~|  | |}|	| n|
d| tt| | t	||g d|	|
dS )Nr   r  rJ  )
r  ri  r   r  r<  ra  r   r   r   rI   )r  r   yr_  r\  rK  r@  r`  rA  r  r]  r^   r^   r_   r    s(    
zLinearBinary.createc                 C   s   d S ra   r^   r   r^   r^   r_   rp     s    zLinearBinary.apply_constraint)r^   )
r   r   r   rQ  r  ru  r$  r  rp  r&  r^   r^   r  r_   ra    s    
ra  c                       s`   e Zd Zd
 fdd	Zdd Zedddee ee ee ee eeee	  d	dd	Z
  ZS )ConvolutionTransposeUnaryr^   c                    s(   t  j|||d ddd d| _d| _d S )Nz1torch.ops.mkldnn._convolution_transpose_pointwisez(mkldnn::_convolution_transpose_pointwiser  Zconvolution_transpose_pointwisea  
            at::Tensor(
                const at::Tensor& input_t,
                const at::Tensor& weight_t,
                const c10::optional<at::Tensor>& bias_opt,
                at::IntArrayRef padding,
                at::IntArrayRef output_padding,
                at::IntArrayRef stride,
                at::IntArrayRef dilation,
                int64_t groups,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm)rE  r  r  r^   r_   r  %  s    z"ConvolutionTransposeUnary.__init__c                 C   s(   | |  | j| j|  | j| j d S ra   r[  rt  r^   r^   r_   ru  B  s    z!ConvolutionTransposeUnary.codegenrR   )	r   r  r2  rF  output_padding_rG  rH  groups_rI  c                 C   sF   d}t | |||||||||
\}}}}||	t|
|g }t|||dS )NTrJ  )r?  r   rd  )r  r   r  r2  rF  re  rG  rH  rf  rK  rI  rL  r6  r  r]  r>  r   r^   r^   r_   r  L  s8    z ConvolutionTransposeUnary.create)r^   rM  r^   r^   r  r_   rd  $  s    

rd  c                       sR   e Zd Zd fdd	Zedddddddeee eeeeeeedddZ  Z	S )	MkldnnRnnLayerr^   c                    s   t  j|||d ddd d S )Nzaten.mkldnn_rnn_layerzat::mkldnn_rnn_layerr  r,  r  r  r^   r_   r  z  s    zMkldnnRnnLayer.__init__rR   )r   w0w1w2w3hxcxrs  batch_sizesr<  hidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc              	      sZ  |  |   |  | |}|  | |}|  | |}|  | |}|  | |}|  |  | |}|   }t|dkstd|\}}}|||g}| }| }g }||||||g}||	|
||||||g	}tt ||d dd }|||g}|||t	
|t	
|g} fddtt||D }|S )Nr   zExpect lstm input to be 3D)r  r]  c                 S   s   t | dkstdt| S )Nr   zExpect output_shape to be 3D)rr   rX   r   r   )output_shapers  r^   r^   r_   get_strides_of_lstm_output  s    z9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_outputc                    s8   g | ]0\}\}}t t  || t|fgqS r^   )r'  r  r   r   rJ   )rl   rm   r8  rP  r)  r   r^   r_   rp     s   
z)MkldnnRnnLayer.create.<locals>.<listcomp>)r  ri  r!  r   rr   rX   rg  r  r   r   r   r   rw   )r  r   rh  ri  rj  rk  rl  rm  rs  rn  r<  ro  rp  rq  rr  rs  rt  rO  Z
seq_lengthZ
mini_batchru  Zhy_shapeZcy_shaperesr  r]  rv  Zoutput_sizesZoutput_stridesZ	output_irr^   rw  r_   r    sZ    



zMkldnnRnnLayer.create)r^   )
r   r   r   r  r$  r   r	   r   r  r&  r^   r^   r  r_   rg  y  s(    rg  c                       s\   e Zd Zd
 fdd	Zdd Zedeeddddee ee ee eeeddd	Z	  Z
S )QConvPointWisePT2Er^   c                    s6   t |dk| _t j|||dddd d| _d| _dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        r  Nz"torch.ops.onednn.qconv2d_pointwiseonednn::qconv2d_pointwiser  Zqconv2d_pointwisea  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm))rr   has_biasr  r  r  r  r  r  r^   r_   r    s    zQConvPointWisePT2E.__init__c                 C   s   dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d  }}|dd  \}	}
}}}}}}}}}}||||||||	|
||||||||f}||  | j| j|| j| j	 t
| jtr| | d S )	Nc                 S   s   g | ]}|  qS r^   r  r  r^   r^   r_   rp     s     z.QConvPointWisePT2E.codegen.<locals>.<listcomp>r   r)   r   r  )r  rA  r  r{  r!  r   r`  ra  r  r  rH   r  rw  r  )r   rr  rd   
const_argsr   packed_weightr2  w_scalew_zpr   r3  r4  r5  x_scalex_zpo_inv_scaleo_zpoutput_dtyperT  rU  rV  r  r^   r^   r_   ru    sb    
zQConvPointWisePT2E.codegenrR   )r   r  r  r  r  r  r2  rG  rF  rH  r5  r  output_zero_pointc                 C   s   d}d }t | ||||	||
|||
\}}}}|d krN|d |d  |d< |d< n|d |d  |d< |d< |  |  |||g }|||||||t||g }|d k	r|tjtjfkst||_t|||dS )NFr   r)   r   rJ  )	r?  r   r   rM   float32r  rX   r   ry  )r  r   r  r  r  r  r  r2  rG  rF  rH  r5  r  r  r  rT  rU  rV  r6  r7  r  r]  r>  r   r^   r^   r_   r  L  sL    zQConvPointWisePT2E.create)r^   )r   r   r   r  ru  r$  r*  r   r	   r  r&  r^   r^   r  r_   ry    s$    .7ry  c                       sr   e Zd Zd fdd	Zdd Zdd Zeej dd	d
Z	e
ddddee ee ee eddd
ddZ  ZS )QConvPointWiseBinaryPT2Er^   c                    sL   t |dk| _| jrdnd| _t j|||dddd d| _d	| _d
| _dS )a~  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, w, b, accum, w_scale, w_zp]
            - const_args = [stride, padding, dilation, groups, x_scale, x_zp, accum_scale, accum_zp, o_inv_scale, o_zp,
            fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, accum, w_scale, w_zp]
            - const_args = const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, accum_scale,
            accum_zp, o_inv_scale, o_zp, fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
           r   r   Nz)torch.ops.onednn.qconv2d_pointwise.binaryrz  r  rO  Zqconv2d_pointwise_binarya  
            at::Tensor(
                at::Tensor act,
                double act_scale,
                int64_t act_zero_point,
                at::Tensor accum,
                double accum_scale,
                int64_t accum_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                torch::List<int64_t> stride,
                torch::List<int64_t> padding,
                torch::List<int64_t> dilation,
                int64_t groups,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view binary_attr,
                c10::optional<at::Scalar> alpha,
                c10::optional<c10::string_view> attr,
                torch::List<c10::optional<at::Scalar>> scalars,
                c10::optional<c10::string_view> algorithm))rr   r{  idx_for_inplace_sumr  r  r  r  r  r  r  r^   r_   r    s    z!QConvPointWiseBinaryPT2E.__init__c                 C   s   dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d |d   }}}	|d	d  \}
}}}}}}}}}}}}}}}|||||||||	||
|||||||||||f}||  | j| j|| j| j	| j
 t| jtr| | d S )
Nc                 S   s   g | ]}|  qS r^   r  r  r^   r^   r_   rp     s     z4QConvPointWiseBinaryPT2E.codegen.<locals>.<listcomp>r   r)   r   r|  r  i)r  rA  r  r{  r!  r   r`  ra  r  r  r  rH   r  rw  r  )r   rr  rd   r~  r   r  r2  accumr  r  r   r3  r4  r5  r  r  accum_scaleaccum_zpr  r  r  rR  alpharT  rU  rV  Z	conv_argsr^   r^   r_   ru    sv    
	z QConvPointWiseBinaryPT2E.codegenc                 C   s   | j | j  gS ra   )r  r  r   r   r^   r^   r_   r    s    z+QConvPointWiseBinaryPT2E.get_mutation_namesr   c                 C   s   t  S ra   r  r   r^   r^   r_   r  
  s    z1QConvPointWiseBinaryPT2E.get_unbacked_symbol_defsrR   )
r   r  r  r2  rG  rF  rH  r5  r  r  c                 C   s   d}d }t | |||
||||||
\}}}}| ||}|| |
d krd|d |d  |d< |d< n|d |d  |d< |d< |  |	  |||	g }|||||||||||t||g }|dkstdtt| ||d}t	|| |j
|j S )NFr   r)   r   rF  zCFor now, only post op sum is supported in QConvPointWiseBinaryPT2E.rJ  )r?  r  r  r   r   rX   r  r  r   r"  r  r  )r  r   r  r  r  r  r  r  r  r  r2  rG  rF  rH  r5  r  r  r  rR  r  rT  rU  rV  r6  r7  r  r]  r>  r=  r)  r^   r^   r_   r    sh    


zQConvPointWiseBinaryPT2E.create)r^   )r   r   r   r  ru  r  r   rS   r   r  r$  r	   r   r  r&  r^   r^   r  r_   r    s"    6@r  c                       sH   e Zd Zd fdd	Zdd Zedeeddddeed		d
dZ  Z	S )QLinearPointwisePT2Er^   TFc                    sh   || _ || _t j|||d|r"dnddd |r4dnd| _d| _|rHd	nd
\}}d| d| d| _dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        Nz)torch.ops.onednn.qlinear_pointwise.tensorz*torch.ops.onednn.qlinear_pointwise.defaultonednn::qlinear_pointwiser  r  r   Zqlinear_pointwise
at::Tensorr  doubleZint64_tI
            at::Tensor(
                at::Tensor act,
                 act_scale,
                a   act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                double output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::string_view post_op_name,
                torch::List<c10::optional<at::Scalar>> post_op_args,
                c10::string_view post_op_algorithm)r{  x_scale_zp_are_tensorsr  r  r  r  r  r   r  r  r]  r{  r  Zx_scale_type_strZx_zp_type_strr  r^   r_   r  f  s.    zQLinearPointwisePT2E.__init__c                 C   s&  dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d  }}| jrt|dkspt|d	 |d
  }	}
|dd  \}}}}}}n,t|dkst|dd  \}	}
}}}}}}||	|
||||||||||f}||  | j	| j
|| j| j| j t| jtr"| | d S )Nc                 S   s   g | ]}|  qS r^   r  r  r^   r^   r_   rp     s     z0QLinearPointwisePT2E.codegen.<locals>.<listcomp>r   r)   r   r|  r  r   r  iro  ir  rA  r  r{  r  rr   rX   r!  r   r`  ra  r  r  r  rH   r  rw  r  )r   rr  rd   r~  r   r  r2  r  r  r  r  r  r  r  rT  rU  rV  r  r^   r^   r_   ru    sn    
	

	zQLinearPointwisePT2E.codegenrR   )	r   r  r  r  r  r  r2  r  r  c                 C   s   t | |||\}}}}t|trLt|trL|  |  |||g }d}n(t|tr`t|tsdt|||g }d}|  |  |||g }|||	|
|t||g }|
d k	r|
tj	tj
fkst|
|_t||||d k	|dS )NTFr  r  r]  r{  r  )rB  rH   rR   r   r*  r   rX   r   rM   r  r  r   r  )r  r   r  r  r  r  r  r2  r  r  r  rT  rU  rV  r  r]  r>  r   r  r^   r^   r_   r    sF    	zQLinearPointwisePT2E.create)r^   TF
r   r   r   r  ru  r$  r*  r   r  r&  r^   r^   r  r_   r  e  s       7=r  c                       sJ   e Zd Zd fdd	Zdd Zedeeddddeedd	
d
dZ  Z	S )QLinearPointwiseBinaryPT2Er^   TFc                    sh   || _ || _t j|||d|r"dnddd |r4dnd| _d| _|rHd	nd
\}}d| d| d| _dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp, x2]
            - const_args is: [x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp, x2]
            - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        Nz0torch.ops.onednn.qlinear_pointwise.binary_tensorz)torch.ops.onednn.qlinear_pointwise.binaryr  r  Zbinary_tensorrO  Zqlinear_pointwise_binaryr  r  r  r  a   act_zero_point,
                at::Tensor weight,
                at::Tensor weight_scales,
                at::Tensor weight_zero_points,
                c10::optional<at::Tensor> bias,
                double inv_output_scale,
                int64_t output_zero_point,
                c10::optional<c10::ScalarType> output_dtype,
                c10::optional<at::Tensor> other,
                double other_scale,
                int64_t other_zero_point,
                c10::string_view binary_post_op,
                double binary_alpha,
                c10::string_view unary_post_op,
                torch::List<c10::optional<at::Scalar>> unary_post_op_args,
                c10::string_view unary_post_op_algorithm)r  r  r  r^   r_   r    s0    
z#QLinearPointwiseBinaryPT2E.__init__c                 C   sJ  dd | j D }g }||   |d }|d }| jr@|d n|d }|d |d |d   }}}	| jrt|d	kszt|d
 |d  }
}|dd  \
}}}}}}}}}}n4t|dkst|dd  \}
}}}}}}}}}}}||
|||||||||	|||||||f}||  | j	| j
|| j| j| j t| jtrF| | d S )Nc                 S   s   g | ]}|  qS r^   r  r  r^   r^   r_   rp   W  s     z6QLinearPointwiseBinaryPT2E.codegen.<locals>.<listcomp>r   r)   r   r  r|  r  r  r  iro  r}  r  )r   rr  rd   r~  r   r  r2  r  r  r  r  r  r  r  r  other_scaleother_zprR  r  rT  rU  rV  r  r^   r^   r_   ru  U  s    

	z"QLinearPointwiseBinaryPT2E.codegenrR   )
r   r  r  r  r  r  r2  r  r  r  c                 C   sF  t | |||\}}}}t|trLt|trL|  |  |||g }d}n(t|tr`t|tsdt|||g }d}|  |  |||g }|dkr| ||}|| |||	|
|||||t	||g
 }|dkr
t
t| |||d k	|d}t|| |jd S |
d k	r0|
tjtjfks*t|
|_t
||||d k	|dS )NTFrF  r  r  )rB  rH   rR   r   r*  r   rX   r  r  r   r  r  r   r"  r  rM   r  r  r   )r  r   r  r  r  r  r  r2  r  r  r  r  r  r  rR  r  rT  rU  rV  r  r]  r>  r=  r  r)  r^   r^   r_   r    sr    





z!QLinearPointwiseBinaryPT2E.create)r^   TFr  r^   r^   r  r_   r    s"      >Jr  c                   @   s|   e Zd ZU dZeed< dd Zdd Zee	j
 ddd	ZdddZedd Zdd Zdd Zedd Zdd ZeZd
S )r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    r  c                 C   s8   t | j|}t|r|S tt| jj d| dd S )NrV  z not callable)rb   r  callableAttributeErrorrY   r   )r   rg   rh   r^   r^   r_   __getattr__   s    zMutableBox.__getattr__c                 C   s
   | j  S ra   r4  r   r^   r^   r_   r     s    zMutableBox.realizer   c                 C   s
   | j  S ra   r'  r   r^   r^   r_   r   	  s    z#MutableBox.get_unbacked_symbol_usesNc                 C   s   | j |S ra   )r  r   r   r^   r^   r_   r     s    zMutableBox.codegen_referencec                 C   s
   | j  S ra   r/  r   r^   r^   r_   r    s    zMutableBox.layoutc                 C   s   | j S ra   r~  r   r^   r^   r_   r     s    zMutableBox.get_layoutc                 C   s
   | j  S ra   )r  r   r   r^   r^   r_   r     s    zMutableBox.get_sizec                 C   s   | j jS ra   r.  r   r^   r^   r_   r     s    zMutableBox.dtypec                 C   sn   t | jtr8t| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr  z))r  
)rH   r  r  rY   r   r   ri   r   )r   Zline0Zendlr+  r   r^   r^   r_   r    s    

zMutableBox.__str__)N)r   r   r   r  r   r   r  r   r   rS   r   r   r   r8  r  r   r   r   r  r#  r^   r^   r^   r_   r    s   



r  c                   @   s   e Zd Zedd ZdS )rR   c                 C   s   t t| S ra   )rR   r  )r  r^   r^   r_   r  2  s    zTensorBox.createN)r   r   r   r   r  r^   r^   r^   r_   rR   1  s   c                   @   sT   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd Ze	dd Z
e	dd ZdS )r  c                 C   s&   t | jttfr"| j tjjkS dS r  )rH   r  r  r#  r   rD   r   graph_inputsr   r^   r^   r_   rN  8  s    zStorageBox.is_input_bufferc                 C   s   t | jto| j tjjkS ra   )rH   r  r7  r   rD   r   r  r   r^   r^   r_   r5  =  s    zStorageBox.is_module_bufferc                 C   s   t | jtttttfr | j S t | jtt	t
fs@tt| j| j }| j }td t| j | j | j d| jd| _tj| j| j_| j| j_|| j_|| j_| jjS )Nr  r  )rH   r  r  r  r  r#  r  r   r,  r]  r   rX   rY   r  r   r   r   r   r   rD   r   r  rg   r   r  r   )r   r  r   r^   r^   r_   r   C  s6    

 

	
zStorageBox.realizec                 C   s0   t | jttfr,|  dkr,|  r,|   dS )zL
        Called on buffers we expect to be forced to realize later.
        r)   N)rH   r  r,  r]  r  8is_pointwise_non_scalar_tensor_num_reads_larger_than_oner   r   r^   r^   r_   r   a  s    
zStorageBox.realize_hintc                 C   s"   t | jto |  tjkp |  S ra   )rH   r  r,  r  r*   Zrealize_acc_reads_thresholdr  r   r^   r^   r_   r   l  s    z!StorageBox.has_exceeded_max_readsc                 C   sd   t ttf ddd}|dkr`t| jttfr`|  tjksX|  sXt	| jr`|| jr`| 
  dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        )loopsc                    s$   dg}|    t fdd|D S )zW
            The heuristic for realizing reused result of heavy ops on cpu
            expc                 3   s   | ]}|d   kV  qdS )r  Nr^   )rl   opZfn_strr^   r_   r   ~  s     zGStorageBox.mark_reuse.<locals>.should_realize_on_cpu.<locals>.<genexpr>)r  rB  )r  Z	heavy_opsr^   r  r_   should_realize_on_cpux  s    z4StorageBox.mark_reuse.<locals>.should_realize_on_cpur)   N)r   r,  r]  rH   r  r  r*   Zrealize_reads_thresholdr  r   r   )r   r3  r  r^   r^   r_   r   r  s    		zStorageBox.mark_reusec                 C   sz   | j }t|tttfrdS t|tr.| }nBt|ttfsHt	t
|td t| | | d|d }t|jS )Nr)   r  r  )r  rH   r  r  r#  r  r  r,  r]  rX   rY   r   r   r   r   rr   r   )r   r  r  r^   r^   r_   r    s     


	zStorageBox.num_readsc                 C   sD   t | jtr@tdd | j D r@tdd | j D dkS dS )Nc                 s   s   | ]}t |tj V  qd S ra   )rH   r+   r  rl   readr^   r^   r_   r     s   zVStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_one.<locals>.<genexpr>c                 s   s   | ]}|j d kV  qdS )r   Nrt   r  r^   r^   r_   r     s     r)   T)rH   r  r,  r  r   rF  r   r^   r^   r_   r    s    
zCStorageBox.is_pointwise_non_scalar_tensor_num_reads_larger_than_oneN)r   r   r   rN  r5  r   r   r   r   r5   r  r  r^   r^   r^   r_   r  7  s   
r  c                   @   s2   e Zd ZU eed< ejjed< dZe	d ed< dS )Subgraphrg   graph_moduleNrE   r   )
r   r   r   ri   r   rM   r   GraphModuler   r
   r^   r^   r^   r_   r    s   
r  c                 C   s(   dd | D } t dd | D t | k S )Nc                 S   s"   g | ]}t |tr| n|qS r^   )rH   r#  r7  rl   r  r^   r^   r_   rp     s   z(_has_aliased_buffers.<locals>.<listcomp>c                 S   s   h | ]}t |qS r^   )idr  r^   r^   r_   r     s     z'_has_aliased_buffers.<locals>.<setcomp>rN  )buffersr^   r^   r_   _has_aliased_buffers  s    r  c                       s   e Zd ZU dZee ed< dZeee	  ed< dZ
ee ed< dZee ed< dZeee  ed< eee	 eeed fdd	Zee	eeee	 d
ddZdd Z  ZS )ConditionalN	predicateoperandstrue_subgraphfalse_subgraphr  r  r  r  r  r  c                    s^   || _ || _|| _|| _g }t|ts0|| || t j	d ||d t
j| | _d S NrM  )r  r  r  r  rH   r  r  rA  r  r  rD   r   r  rg   )r   r  r  r  r  r  r  r  r^   r_   r    s    


zConditional.__init__)r  true_fnfalse_fnr  c              
      s    |} fdd|D }tjjjd }dd |D }||fD ]J}|jd kr@tjj|j||jd|_t|j |jj	|  W 5 Q R X q@|jj
}|jj
}	d|fd|	ffD ]$\}
}t|rtd|
 d	| qt|t|	kst||	ftt||	D ]\}\}}| | ks(t|||f| | ksHt|||f| | ksht|||f| | kst|||f| j| jkst|||fqt|ts| }n"t|d
kstd|d
  }t||||t|dfddt|D }|_|S )Nc                    s   g | ]}  |qS r^   r  r  r  r^   r_   rp     s     z&Conditional.create.<locals>.<listcomp>r  c                 S   s   g | ]}|j d  qS r  r  r  r^   r^   r_   rp     s     ZgmZexample_inputsZsubgraph_namer  r  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: r   zQWhen predicate is not a Tensor, there must be at least one operand in torch.cond.r  c              
      sF   g | ]>\}}t t| | | | | jd  t|fgqS r  	r'  r  r   r   r   r   r   r?  rI   rl   rm   r  )conditionalr^   r_   rp     s   )ri  rD   r   r!  rd   make_subgraphr  rg   set_graph_handlerrungraph_outputsr  rX   rr   r   rw   r   r   r   r   r   r?  rH   r  r  r  r  )r  r  r  r  r  Zfx_operandsZfake_operandssubgraphZtrue_outputsZfalse_outputsrg   r  rm   tofor   r^   )r  r  r_   r    s\    

    $


zConditional.createc                 C   s   | |  d S ra   )Zcodegen_conditionalrt  r^   r^   r_   ru  1  s    zConditional.codegen)r   r   r   r  r
   r   r   r  r	   rR   r  r  r  r  r'  r  r  r$  r  ru  r&  r^   r^   r  r_   r    s$   
Qr  c                       s   e Zd ZU dZeee  ed< dZeee  ed< dZ	ee
 ed< dZee
 ed< dZeee  ed< ee ee e
e
ed fdd	Zee
e
ee ee d
ddZdd Z  ZS )	WhileLoopNcarried_inputsadditional_inputscond_subgraphbody_subgraphr  r  r  r  r  r  c                    s@   || _ || _|| _|| _t jd ||| d tj| | _	d S r  )
r  r  r  r  r  r  rD   r   r  rg   )r   r  r  r  r  r  r  r^   r_   r  =  s    zWhileLoop.__init__)cond_fnbody_fnr  r  c              
      s   fdd|D } fdd|D }|| }t jjjd t jjjd  }dd |D }||fD ]J}|jd kr^t jj|j||jd|_t |j |jj|  W 5 Q R X q^|jj	}	|jj	}
t
|
rtd|
 t|	d	kst|	|	d
  tjkst|	t|	d
  d
kst|	t|d
ks2td|d
  }t|t|
ks\t||
ftt||
D ]\}\}}| | kst|||f| | kst|||f| |   kr|ksn t||||f| | kst|||f| j| jksjt|||fqjt||||t|dfddt|
D }t||D ].\}}| t jjkrft jj|  qf|_|S )Nc                    s   g | ]}  |qS r^   r  r  r  r^   r_   rp   Z  s     z$WhileLoop.create.<locals>.<listcomp>c                    s   g | ]}  |qS r^   r  r  r  r^   r_   rp   [  s     r|  r  c                 S   s   g | ]}|j d  qS r  r  r  r^   r^   r_   rp   _  s     r  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: r)   r   z9torch.while_loop is assumed to have at least one operand.r  c              
      sF   g | ]>\}}t t| | | | | jd  t|fgqS r  r  r  )
while_loopr^   r_   rp     s   )rD   r   r!  rd   r  r  rg   r  r  r  r  rX   rr   r   rM   r   r   r   r   rw   r   r   r?  r  r  r   r  r  rA  r  )r  r  r  r  r  Z
all_inputsZfx_all_inputsZfake_all_inputsr  Zcond_outputsZbody_outputsr   rm   r  Zbor  r0  r   r^   )r  r  r_   r  R  sd    

  2 (	
zWhileLoop.createc                 C   s   | |  d S ra   )Zcodegen_while_looprt  r^   r^   r_   ru    s    zWhileLoop.codegen)r   r   r   r  r
   r	   rR   r   r  r  r  r  r  r'  r  r  r$  r  ru  r&  r^   r^   r  r_   r  5  s$   
\r  c                       s8   e Zd Zd	dd fddZ fddZdd Z  ZS )
rW   Nr  c          
   	      sj   t  j|||||d |d ddlm} |||||}	|	d k	sBt|	| _tjj	|	d | _
| tjj|	< d S )N)re   rf  r   )get_effect_key)r  r  Ztorch._higher_order_ops.effectsr  rX   effect_typerD   r   Zeffectful_opsr  prev_effect_buffer)
r   r  rQ  r  r  r  re   rf  r  r  r  r^   r_   r    s    
zEffectfulKernel.__init__c                    s0   t   }| jd k	r,|jt| j  |S ra   )r  r  r  r   rA  r+   r  r   )r   r  r  r^   r_   r    s    

zEffectfulKernel.get_read_writesc                 C   s   dS r  r^   r   r^   r^   r_   r    s    z EffectfulKernel.has_side_effects)N)r   r   r   r  r  r  r&  r^   r^   r  r_   rW     s    	
rW   c                   @   s<   e Zd ZU eed< ejjed< dd Zdd Z	d
dd	Z
dS )rF  rg   r   c                 C   s   | j S ra   rf   r   r^   r^   r_   r     s    zTorchBindObject.get_namec                 C   s   d S ra   r^   r   r^   r^   r_   r     s    zTorchBindObject.get_deviceNc                 C   s   | j S ra   rf   r   r^   r^   r_   r     s    z!TorchBindObject.codegen_reference)N)r   r   r   ri   r   rM   Z_CZScriptObjectr   r   r   r^   r^   r^   r_   rF    s
   
rF  c                       sV   e Zd Zeeddd Z fddZej	j
ed fddZ fd	d
Z  ZS )InterpreterShimNc                   C   s   t jtS ra   )rM   r   Zsymbolic_tracer   r^   r^   r^   r_   	_dummy_gm  s    zInterpreterShim._dummy_gmc                    s>   t  j|  dd | | _|| _|| _d| _|j| _d | _	d S )NF)Zgarbage_collect_values)
r  r  r  moduler   
submodulesZextra_tracebackr  Z
fetch_attrr!  r   r   r  r  r^   r_   r    s    zInterpreterShim.__init__)r  r   c                    s   || _ t |S ra   )r!  r  run_node)r   r  r  r^   r_   r    s    zInterpreterShim.run_nodec              
      s0   t |  t j||W  5 Q R  S Q R X d S ra   )rD   Zset_interpreter_handlerr  r  )r   rd   re   r  r^   r_   r    s    zInterpreterShim.run)r   r   r   r   r  	lru_cacher  r  rM   r   r   r   r  r  r&  r^   r^   r  r_   r    s   r  c                       sx   e Zd ZdZ fddZedd Zedd Zdd	 Ze	j
d
ddZdd Zdd Zdd Zdd Zdd Z  ZS )r  z
    Captures the body of a Loops subclass into an FX graph.  Persists any
    indexing simplifications and makes it easier to analyze loop bodies.
    c                    sj   t    || _i | _i | _g | _g | _i | _i | _g | _	d| j
i| _i | _g | _t| ||| _d | _d S )N	get_index)r  r  r  r  indexing_exprs_namer   writesr  r  r  r  r  	subblocksindirect_varsLoopBodyBlock
root_blockindexing)r   rh   rd   r  r  r^   r_   r    s    
zLoopBody.__init__c                 C   s0   t | jjfdd | j D }dd |D S )Nc                 s   s   | ]}|j V  qd S ra   )r   )rl   blockr^   r^   r_   r   "  s     z%LoopBody.get_nodes.<locals>.<genexpr>c                 S   s   g | ]}|j D ]}|qqS r^   )rZ   )rl   r   r[   r^   r^   r_   rp   $  s       z&LoopBody.get_nodes.<locals>.<listcomp>)r  r  r  r   r  rL   )r   Z
all_graphsr^   r^   r_   	get_nodes  s
    zLoopBody.get_nodesc                 C   s   ddl m} || S )Nr)   )	BoundVars)boundsr  )r   r  r^   r^   r_   r  &  s    zLoopBody.boundsc                 C   s`   dt | j g}|dd | j D  |dd td| jfg| j D  d	|S )Nzvar_ranges = c                 S   s   g | ]\}}| d | qS )r  r^   )rl   rg   r  r^   r^   r_   rp   /  s     z&LoopBody.debug_str.<locals>.<listcomp>c                 S   s   g | ]\}}| |qS r^   )	debug_str)rl   rg   r  r^   r^   r_   rp   1  s   r  r  )
rK   r  rA  r  r  r  r  r  r  r   r   r^   r^   r_   r  -  s    
 zLoopBody.debug_str)r  c                 C   sd   t | || |d k	r,|t | | d|< || jkrZdt| j }|| j|< || j|< | j| S )NZ
_name2exprrn   )rb   r  r  rr   r  )r   r  categorybuf_namerg   r^   r^   r_   add_index_expr:  s    


zLoopBody.add_index_exprc                 C   s<   |d   r|| jkr|}n| t| j }|| j|< |S )zaNot actually for nn.Modules, but subblocks in generated code are mapped to FX call_module opcodesr  )	isnumericr  rr   )r   r  rG   rg   r^   r^   r_   add_submoduleD  s
    
zLoopBody.add_submodulec                 C   s"   t tjt| j}| j| |S ra   )r@   r(   ZINDIRECTrr   r  r  )r   r   rt  r^   r^   r_   add_indirectM  s    zLoopBody.add_indirectc                    sB   t t  krdS | jdk	s"t fdd| j D | _dS )z,Swap in a variable used in indirect indexingNc                    s    i | ]\}}|t | iqS r^   ro  r  newr   r^   r_   r   W  s      z-LoopBody.replace_indirect.<locals>.<dictcomp>)ri   r  rX   r  )r   r   r  r^   r  r_   replace_indirectR  s    zLoopBody.replace_indirectc                 C   s   | j d k	st| j | S ra   )r  rX   r   r^   r^   r_   r  Y  s    zLoopBody.get_indexc                    s   t tj|}t|tjks0t|jftfdd|D sJttt	j
 |  fddj D _ }d _|S )Nc                 3   s   | ]}| j kV  qd S ra   )r  r  r   r^   r_   r   `  s     z$LoopBody.__call__.<locals>.<genexpr>c                    s   i | ]\}}|t | qS r^   ro  )rl   rg   r  rp  r^   r_   r   b  s    z%LoopBody.__call__.<locals>.<dictcomp>)rI   r  r  from_iterablerr   r  rX   r  rK   rw   r  r  r  r  r  )r   r  rn   r  r^   )rq  r   r_   __call__]  s     
zLoopBody.__call__)r   r   r   r  r  r5   r  r  r  rS   r   r  r  r  r  r  r  r&  r^   r^   r  r_   r    s   


	r  c                   @   s@   e Zd ZdZeedef ee dddZdd Z	dd	d
Z
dS )r  a  
    Captures the body of a Loops subclass into an FX graph.
    In normal cases there will be a 1:1 mapping between LoopBody and
    LoopBodyBlock, hower in the case of ops.masked() the masked out
    operations will manifest as an extra LoopBodyBlock.
    .)r  rh   rd   c           	   	      s   |_ dfdd	 G  fdddtj}tj tjjjd_	dddi }d	d
l
m} d	dlm} |||j j}tjr||j j}t| t||  W 5 Q R X j_d S )Nc              	      s    dd j| ||fi S )Ncall_moduler  )create_proxyr  r  )r  r  r  r   tracerr^   r_   	add_indexv  s    z)LoopBodyBlock.__init__.<locals>.add_indexc                       s  e Zd Zd_eejd fddZd! fdd	Z fdd	Z	d
d Z
 fddZ fddZeejejed fddZeedef dfddZeeeedf eedf geedf f dfddZdd Zed"fdd	Zefdd ZdS )#z/LoopBodyBlock.__init__.<locals>.CaptureIndexingCaptureIndexing)rg   rn   c                    s    |d|}| j ||S )Nr   )_innerr{  )r   rg   rn   r  r^   r_   r{    s    z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.loadNc                    s    |d|}| j ||||S Nr  )r  r-  )r   rg   rn   r   r<  r   r^   r_   r-    s    z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.storec                    s    |d|}| j |||S r  )r  rc  )r   rg   rn   r   r   r^   r_   rc    s    z?LoopBodyBlock.__init__.<locals>.CaptureIndexing.store_reductionc                    s8   | j |||| d|kr4t fddtdD S  S )Nr  c                 3   s   | ]} | V  qd S ra   r^   rk   r  r^   r_   r     s     zLLoopBodyBlock.__init__.<locals>.CaptureIndexing.reduction.<locals>.<genexpr>r   )r  rb  rJ   rs   )r   r   r_  rW  r   r^   r  r_   rb    s    z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.reductionc                    s:   t |ttjfr"| jt||S  |d}| j||S Nr  )rH   r   rS   r   r  r)  r  )r   rn   r   r   r^   r_   r    s    
z:LoopBodyBlock.__init__.<locals>.CaptureIndexing.index_exprc                    s&    |d} |d}| j ||||S r  )r  check_bounds)r   rn   r   r  r  r   r^   r_   r    s    

z<LoopBodyBlock.__init__.<locals>.CaptureIndexing.check_bounds)offsets_nameoffsets_sizeindexing_dtyper  c                    s    |d}| j |||||S r  )r  	bucketize)r   rL   r  r  r  r  r   r^   r_   r    s    
    z9LoopBodyBlock.__init__.<locals>.CaptureIndexing.bucketize.)masked_bodyc                    sH    fdd}j |d}tj |g   j j|< d|| |fi S )zb
                Recursively capture the masked out body in another LoopBodyBlock
                c                    s   t j|  |S ra   )rD   rC   r  )rS  r  Zsubblockr^   r_   shim  s    zDLoopBodyBlock.__init__.<locals>.CaptureIndexing.masked.<locals>.shimZmasked_subblockr  )r  r  r  r  r  )Z
mask_proxyr	  Zother_proxyr  rg   r  r
  r_   r    s       z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.maskedrX  c                    sL    fdd}j |d}d|| |fi tfddtt|D S )Nc                    s   t j|  |S ra   )rD   rC   r  )r  rL   r  r^   r_   r    s    zBLoopBodyBlock.__init__.<locals>.CaptureIndexing.scan.<locals>.shimr  r  c                 3   s   | ]} | V  qd S ra   r^   rk   r  r^   r_   r     s     zGLoopBodyBlock.__init__.<locals>.CaptureIndexing.scan.<locals>.<genexpr>)r  r  r  rJ   rs   rr   )Zdtype_proxyrX  value_proxyr  rg   r  )rX  r  r_   r    s    z4LoopBodyBlock.__init__.<locals>.CaptureIndexing.scanc                 S   s   | j |}|d |d fS r>  )r  frexp)r   r  r  r^   r^   r_   r    s    z5LoopBodyBlock.__init__.<locals>.CaptureIndexing.frexpTc                    sD   j  fdd}dj |d | fi  S )z
                Flow data from tensors into indexing formulas.
                Introduce a call_module to update the indexing.
                c                    s   j tj|   d S ra   )r  r  rD   rC   indirect_indexing)Znew_var)checkr   r   rt  r^   r_   set_indirect  s     zWLoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexing.<locals>.set_indirectr  set_)r  r  r  r  )Zindex_proxyr   r  r  r  )r  r   rt  r_   r    s    zALoopBodyBlock.__init__.<locals>.CaptureIndexing.indirect_indexingc                    s     dd| fi  d S )Nr  )r  r  )r  r^   r_   r    s    z6LoopBodyBlock.__init__.<locals>.CaptureIndexing.output)N)T)r   r   r   rg   ri   rS   r   r{  r-  rc  rb  r  r  rM   r   r   r  r   r   r   r  r   r  r  r  r  r^   r  r   r  r^   r_   r  ~  s0   "r  )Z
tracer_clsplaceholderrC   r^   r)   )IndexPropagation)SimplifyIndexing)N)r  rD   ZWrapperHandlerrM   r   ZTracerZGraphr	  r   r  Zindex_propagationr  r   r  r  r*   Zconstant_and_index_propagationr  rC   r  )	r   r  rh   rd   r  Z	proxy_opsr  r  handlerr^   r  r_   r  s  s"    s
 zLoopBodyBlock.__init__c                 C   s"   | j }| jj}t||t S ra   )r   r  r  r  r  rD   Zget_ops_handlerr  r^   r^   r_   r     s    zLoopBodyBlock.__call__r  c              
   C   s8   t j| jj| jj}tdd|	 
dd| dS )Nz;[^\n]*r   zdef forward(zdef r  )rM   r   r  r  r  r   coderesubstriprZ  )r   rg   r  r^   r^   r_   r  
   s    zLoopBodyBlock.debug_strN)r  )r   r   r   r  r  r   r   r	   r  r  r  r^   r^   r^   r_   r  k  s
    r  c                   @   sb   e Zd Zdd Zdd Zdd Zeeee	e f ddd	d
Z
eeee	e f dddZdS )_CollectiveKernelc                 C   s   dS r  r^   r   r^   r^   r_   r     s    z!_CollectiveKernel.should_allocatec                 C   s   dS r  r^   r   r^   r^   r_   r     s    z"_CollectiveKernel.has_side_effectsc                 C   s^   ddl m} |jj| _|jj| _| jdd d| j | _||| _	dd |jj
D | _d S )Nr)   r  r  r   c                 S   s   g | ]}|j r|jqS r^   rj  r  r^   r^   r_   rp   %   s     z4_CollectiveKernel.set_cpp_kernel.<locals>.<listcomp>)r  r  rk  rg   ra  r
  r  rZ  r  r  rl  rb  r  r^   r^   r_   r     s    


z _CollectiveKernel.set_cpp_kernelN)r  r   c              	   O   s   |j }|dd}tjj$ | j||f||\}}}	}
}W 5 Q R X |rZt| d| |D ]}|  q^| t|d 	 |||	|
}||_
||_t|ft|  d S )Nr  rV  rX  r   )_namerZ  rD   r   r  r  rX   r   r  r   ra  r`  r"  r|  Ztree_leaves)r  rQ  r  rd   re   ra  r`  r  r  r  r  rf  
tensor_argr)  r^   r^   r_   create_inplace/   s0    


z _CollectiveKernel.create_inplacer1  c              	      s   |j }|dd}tjj$  j||f||\}}}	}
}W 5 Q R X |rZt| d| |D ]}|  q^t|t	r 
||} t||||	|
|_|_ fddt|D _jS   ||||	|
|_|_g_S d S )Nr  rV  r]  c                    s(   g | ] \}}t  |t|fgqS r^   )r'  r#  rI   )rl   rm   r  r  r)  r^   r_   rp      s   z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)r  rZ  rD   r   r  r  rX   r   rH   rI   r  r  ra  r`  r   r  r#  )r  rQ  r  rd   re   ra  r`  r  r  r  r  rf  r  r   r^   r   r_   create_out_of_placec   sP    



z%_CollectiveKernel.create_out_of_place)r   r   r   r  r  r  r$  r   rR   r	   r  r!  r^   r^   r^   r_   r     s   3r  c                       s8   e Zd Zdd ZeeddddZ fddZ  ZS )	_WaitKernelc                 C   sd   | j d }t|tr |j d gS t|tr\|j d }t|trX|jd \}}|j | gS g S g S d S r   )r  rH   r  r'  r  )r   r0  Zcollr   r   r^   r^   r_   get_volatile_reads   s    




z_WaitKernel.get_volatile_readsN)r0  r   c           	   	   C   sf   t jj | ||\}}}}}W 5 Q R X |r@t| d| | t| ||||}t|| d S )NrX  )rD   r   r  r  rX   r  r   r"  )	r  rQ  r0  r  r  r  r  rf  r)  r^   r^   r_   create_wait   s$    



z_WaitKernel.create_waitc                    s6   t   }|  }|D ]}|jt|  q|S ra   )r  r  r#  r   rA  r+   r  r   )r   r  Zvolatile_readsZvrr  r^   r_   r     s
    
z_WaitKernel.get_read_writes)	r   r   r   r#  r$  rR   r$  r  r&  r^   r^   r  r_   r"     s   r"  c                 C   sd   t | ttjfrt| S t | ttfrFt }| D ]}|t|O }q0|S t | t	j
rZt| S t S d S ra   )rH   r$   rS   r   r!   rJ   rI   r   r  rM   r  )r   r  r   r^   r^   r_   r     s    r  )T)T)TFNF)FN(  rB  r   r  r  r  loggingr  textwrapr   r   r   typingr   r   r   r   r   r	   r
   r   r   r   r   r   Zunittest.mockr   rS   r   r   Ztorch._export.serde.schemaZ_exportZserder  r  Ztorch._loggingrM   Ztorch.fxZtorch.utils._pytreer  Z_pytreer|  Ztorch._dynamo.device_interfacer   Ztorch._dynamo.utilsr   Ztorch._export.serde.serializer   Z*torch._higher_order_ops.auto_functionalizer   Ztorch._inductorr   Ztorch._prims_commonr   r   r   r   r   Ztorch._subclasses.fake_tensorr   Z%torch.fx.experimental.symbolic_shapesr   r   r    r!   r"   r#   r$   Ztorch.utils._sympy.functionsr%   r&   r'   Ztorch.utils._sympy.symbolr(   r   r*   r+   Zcodegen.commonr,   r-   r.   r/   r0   Zops_handlerr1   Zruntime.hintsr2   Zruntime.runtime_utilsr3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   ZvirtualizedrC   rD   r   rE   	getLoggerr   r  r   rU  r`   rj   rz   r|   r   r  r  r   r   r   r   r   r   r   r   r   r  r   r+  r,  r9  r[  r\  r]  r  r  r   r  r   r  r  rj  r%  r&  rO   rI  rT  r\  r  r#  r  r  r  r  r  r  rw  r  r   r  r  r  r   r  r7  r  r  r  r  r  r*  r   ri   r,  r#  r-  r/  r:  r=  r  rH  rI  r\  rh  r  r  rS  r  r"  r  r  r  r  r  r  r  r  r  rP   rQ   r  Z_embedding_bagr  Z_fft_c2cZ'_scaled_dot_product_efficient_attentionZ#_scaled_dot_product_flash_attentionZ
_scaled_mmZaddmmr   ZbmmZcopy_mmZrepeat_interleaver  Znonzeror  r   Zview_as_realr   r  r+  r  r'  r?  rB  rC  rN  rW  rZ  r]  ra  rd  rg  ry  r  r  r  r  rR   r  r  r  r  r  rW   rF  r   ZInterpreterr  r  r  r  r"  r  r^   r^   r^   r_   <module>   s  8$	D
$	
\u
5     Z   6      
( WF): KK		 U$P   36&"$19     [)(r-3K0#0   i9 
 ,FYd:@FUm + V 2 c8v	w},c * 7