U
    h                     @   s   d dl mZ ejdddZejdddZejdddZejdd	d
ZejdddZejdddZ	ejdddZ
ejdddZejdddZdS )    )coreNc              	   C   s   t jddg t jdd| dS )Nzmov.u64 $0, %globaltimer;z=lF   Zdtypeis_purepack_builder)r   inline_asm_elementwiseZint64r    r
   R/var/www/html/venv/lib/python3.8/site-packages/triton/language/extra/cuda/utils.pyglobaltimer   s    r   c              	   C   s   t jddg t jdd| dS )Nzmov.u32 $0, %smid;z=rTr   r   )r   r   Zint32r	   r
   r
   r   smid
   s    r   c                 C   s   t | jjd S )N    r   Z	constexproptions	num_warpsr	   r
   r
   r   num_threads   s    r   c                 C   s   t | jjS )Nr   r	   r
   r
   r   r      s    r   c              	   C   s   t jdd| gt jdd|dS )Na  {                                      
.reg .b32 a<2>, b<2>;                  
prmt.b32 a0, 0, $2, 0x5746;            
and.b32 b0, a0, 0x7f007f00;            
and.b32 b1, a0, 0x00ff00ff;            
and.b32 a1, a0, 0x00800080;            
shr.b32  b0, b0, 1;                    
add.u32 b1, b1, a1;                    
lop3.b32 $0, b0, 0x80008000, a0, 0xf8; 
shl.b32 $1, b1, 7;                     
}                                      
z=r,=r,rT   r   )r   r   float16)argr   r
   r
   r   convert_fp8e4b15_to_float16!   s    
    r   c              	   C   s>   d}|r|d7 }n|d7 }|d7 }t j|d| gt jdd|dS )	NaN  {
            .reg .pred p<4>;
            .reg .b32 a<2>, b<2>;
            .reg .b16 c<4>;
            .reg .b16 max_val_f16;
            .reg .b32 max_val_f16x2;
            mov.b16 max_val_f16,   0x3F00;
            mov.b32 max_val_f16x2, 0x3F003F00;
            and.b32 a0, $1, 0x7fff7fff;
            and.b32 a1, $2, 0x7fff7fff;zSmin.f16x2 a0, a0, max_val_f16x2;
                  min.f16x2 a1, a1, max_val_f16x2;a  setp.lt.f16x2  p0|p1, a0, max_val_f16x2;
                  setp.lt.f16x2  p2|p3, a1, max_val_f16x2;
                  mov.b32 {c0, c1}, a0;
                  mov.b32 {c2, c3}, a1;
                  selp.b16  c0, c0, max_val_f16, p0;
                  selp.b16  c1, c1, max_val_f16, p1;
                  selp.b16  c2, c2, max_val_f16, p2;
                  selp.b16  c3, c3, max_val_f16, p3;
                  mov.b32 a0, {c0, c1};
                  mov.b32 a1, {c2, c3};zmad.lo.u32 a0, a0, 2, 0x00800080;
              mad.lo.u32 a1, a1, 2, 0x00800080;
              lop3.b32 b0, $1, 0x80008000, a0, 0xea;
              lop3.b32 b1, $2, 0x80008000, a1, 0xea;
              prmt.b32 $0, b0, b1, 0x7531;
              }z=r,r,rTr   r   )r   r   Zfloat8e4b15)r   	has_minx2r   asmr
   r
   r   convert_float16_to_fp8e4b152   s    


r   c                 C   s   | j j r6t| |d}|j r2|jtj|d}|S | j j sR| j j sRt	| }| j j rt|jtj
d|d}t|||d}|S )Nr	   Zrtz)fp_downcast_roundingr   r   r   )typeZscalarZis_fp8e4b15r   Zis_fp32tor   Zfloat32Zis_fp16AssertionErrorr   r   )r   dst_tyr   r   r   Z
upcast_valZdowncast_valr
   r
   r   convert_custom_float8V   s    
r    c                 C   s   t | ||d|dS )NTr   r    r   r   r   r   r
   r
   r   convert_custom_float8_sm80f   s    r#   c                 C   s   t | ||d|dS )NFr   r!   r"   r
   r
   r   convert_custom_float8_sm70k   s    r$   )N)N)N)N)N)N)N)NN)NN)Ztriton.languager   Zexternr   r   builtinr   r   r   r   r    r#   r$   r
   r
   r
   r   <module>   s$   #