def PatchTFTSimple(
c_in:int, # the number of input channels
win_length:int, # the length of each time-domain patch, or the short-time FT window length (when time_domain=False)
hop_length:int, # the distance between the starts of consecutive patches / FFT windows
max_seq_len:int, # maximum sequence length
time_domain:bool=True, pos_encoding_type:str='learned', # pos_encoding_type options include learned or tAPE
relative_attn_type:str='vanilla', # options include vanilla or eRPE
use_flash_attn:bool=False, # indicator to use flash attention
use_revin:bool=True, # if time_domain is true, whether or not to instance normalize time data
dim1reduce:bool=False, # indicator to normalize by timepoint in revin
affine:bool=True, # if time_domain is true, whether or not to learn revin normalization parameters
mask_ratio:float=0.1, # amount of signal to mask
augmentations:list=['patch_mask', 'jitter_zero_mask', 'reverse_sequence', 'shuffle_channels'], # the types of augmentation/mask to use; mask options are patch or jitter_zero. NOTE(review): mutable default list — shared across calls if mutated downstream; consider a None sentinel
n_layers:int=2, # the number of transformer encoder layers to use
d_model:int=512, # the dimension of the input to the transformer encoder
n_heads:int=2, # the number of heads in each layer
shared_embedding:bool=False, # indicator for whether or not each channel should be projected with its own set of linear weights to the encoder dimension
d_ff:int=2048, # the feedforward layer size in the transformer
norm:str='BatchNorm', # BatchNorm or LayerNorm during training
attn_dropout:float=0.0, # dropout in attention
dropout:float=0.1, # dropout for linear layers
act:str='gelu', # activation function
res_attention:bool=True, # whether to use residual attention
pre_norm:bool=False, # indicator to pre batch or layer norm
store_attn:bool=False, # indicator to store attention
pretrain_head:bool=True, # indicator to include a pretraining head
pretrain_head_n_layers:int=1, # how many linear layers on the pretrained head
pretrain_head_dropout:float=0.0, # dropout applied to pretrain head
):