
from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...cache_utils import Cache, HybridCache, StaticCache
from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import logging
from ..gemma.modeling_gemma import (
    GemmaAttention,
    GemmaForCausalLM,
    GemmaForSequenceClassification,
    GemmaForTokenClassification,
    GemmaMLP,
    GemmaModel,
    GemmaRMSNorm,
    apply_rotary_pos_emb,
    repeat_kv,
)


_CHECKPOINT_FOR_DOC = "google/gemma2-7b"

logger = logging.get_logger(__name__)


class Gemma2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate a Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma2-7B.
    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Gemma2Model`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256): scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096): in Gemma2, every other layer uses sliding window attention. This is the
            size of the sliding window.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0): scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0): scaling factor when applying tanh softcapping on the attention scores.
        cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`.

    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
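    >>> # Illustrative addition (not part of the original example): every field above can be overridden
    >>> # through keyword arguments, e.g. to build a tiny randomly initialized model for quick tests
    >>> tiny_configuration = Gemma2Config(num_hidden_layers=2, hidden_size=128, intermediate_size=256, head_dim=32)
    >>> tiny_model = Gemma2Model(tiny_configuration)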
    ```"""

    model_type = "gemma2"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=2304,
        intermediate_size=9216,
        num_hidden_layers=26,
        num_attention_heads=8,
        num_key_value_heads=4,
        head_dim=256,
        hidden_activation="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        query_pre_attn_scalar=256,
        sliding_window=4096,
        final_logit_softcapping=30.0,
        attn_logit_softcapping=50.0,
        cache_implementation="hybrid",
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.cache_implementation = cache_implementation


class Gemma2RMSNorm(GemmaRMSNorm):
    pass


class Gemma2MLP(GemmaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.act_fn = ACT2FN[config.hidden_activation]


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Gemma2Attention(GemmaAttention):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
                "sliding_window": self.sliding_window,
            }
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

            # Here we need to slice as we use a static cache by default, but FA2 does not support it
            if attention_mask is not None and self.config._attn_implementation == "flash_attention_2":
                seq_len = attention_mask.shape[-1]
                key_states, value_states = key_states[:, :, :seq_len, :], value_states[:, :, :seq_len, :]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma2DecoderLayer(nn.Module):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        self.is_sliding = not bool(layer_idx % 2)
        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma2MLP(config)
        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        last_cache_position: int = 0,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
            # In prefill, we may be larger than sliding window
            effective_seq_len = max(cache_position.shape[0], self.sliding_window)
            # For FA2, the mask is 2D and is of shape [bs, processed_tokens] (not [bs, max_cache_len]),
            # thus we must slice from the right (at most `effective_seq_len` elements)
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask[:, -effective_seq_len:]
            # Otherwise, the mask is 4D of shape [bs, 1, query_len, max_cache_len] thus we must slice
            # from the left, with an offset if we are beyond the sliding window
            else:
                min_dtype = torch.finfo(attention_mask.dtype).min
                sliding_window_mask = torch.tril(
                    torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
                )
                attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
                # In case we are beyond the sliding window, we need to correctly offset the mask slicing
                offset = last_cache_position - effective_seq_len
                # Should only be used when beyond the sliding window (i.e. offset > 0)
                offset = max(0, offset)
                attention_mask = attention_mask[:, :, :, offset : offset + effective_seq_len]

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Gemma2Model(GemmaModel):
    def __init__(self, config: Gemma2Config):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        last_cache_position: Optional[int] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            batch_size, seq_len, _ = inputs_embeds.shape
            past_key_values = HybridCache(
                self.config,
                max_batch_size=batch_size,
                max_cache_len=seq_len,
                dtype=inputs_embeds.dtype,
                device=self.device,
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
        # (retrieving the same value from `cache_position` later on would crash dynamo)
        if last_cache_position is None:
            last_cache_position = 0
            if attention_mask is not None:
                # In case a 4d mask is passed directly without using `generate`, we have to rely on cache_position
                last_cache_position = (
                    attention_mask.shape[-1] if attention_mask.dim() == 2 else cache_position[-1].item()
                )
        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        # embed positions
        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # normalized
        # Gemma2 downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    position_embeddings,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    last_cache_position,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=causal_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                    last_cache_position=last_cache_position,
                    **flash_attn_kwargs,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
        return output if return_dict else output.to_tuple()

    @torch.no_grad()
    def _update_causal_mask(
        self,
        attention_mask: torch.Tensor,
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: HybridCache,
        output_attentions: bool,
    ):
        # Flash Attention currently doesn't support static cache but Gemma2 works only with static cache.
        # So we pass in the attention mask as is in any case, not only when there is padding, and use its shape
        # to cut out keys/values trailing zeros used in the static cache. This workaround is compile compatible
        # as it doesn't cause dynamic control issues.
        if self.config._attn_implementation == "flash_attention_2":
            return attention_mask

        dtype, device = input_tensor.dtype, input_tensor.device
        sequence_length = input_tensor.shape[1]
        if isinstance(past_key_values, (HybridCache, StaticCache)):
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = attention_mask.shape[-1] if attention_mask is not None else input_tensor.shape[1]

        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            device=device,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )
        return causal_mask


class Gemma2ForCausalLM(GemmaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[HybridCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            logits_to_keep (`int` or `torch.Tensor`, *optional*):
                If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
                If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
                This is useful when using packed tensor format (single dimension for batch and sequence length).

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten: has a special cache type, `HybridCache`

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        # This is needed to correctly slice the mask without data-dependent slicing later on if using dynamo tracing
        # (retrieving the same value from `cache_position` later on would crash dynamo)
        model_inputs["last_cache_position"] = attention_mask.shape[-1] if attention_mask is not None else 0
        if logits_to_keep is None:
            _ = model_inputs.pop("logits_to_keep", None)

        if (
            isinstance(past_key_values, HybridCache)
            and attention_mask.ndim == 2
            and not self.config._attn_implementation == "flash_attention_2"
        ):
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
                device = model_inputs["inputs_embeds"].device
            else:
                batch_size, sequence_length = model_inputs["input_ids"].shape
                device = model_inputs["input_ids"].device

            attention_mask = self.model._prepare_4d_causal_attention_mask_with_cache_position(
                attention_mask,
                sequence_length=sequence_length,
                target_length=past_key_values.get_max_cache_shape(),
                dtype=self.lm_head.weight.dtype,
                device=device,
                cache_position=cache_position,
                batch_size=batch_size,
            )
            model_inputs["attention_mask"] = attention_mask

        return model_inputs


class Gemma2ForSequenceClassification(GemmaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()


class Gemma2ForTokenClassification(GemmaForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()


__all__ = [
    "Gemma2Config",
    "Gemma2ForCausalLM",
    "Gemma2Model",
    "Gemma2PreTrainedModel",
    "Gemma2ForSequenceClassification",
    "Gemma2ForTokenClassification",
]