461 lines
7.9 KiB
YAML
461 lines
7.9 KiB
YAML
|
_name_or_path:
|
||
|
value: T5Autocorrection_Book
|
||
|
_wandb:
|
||
|
value:
|
||
|
cli_version: 0.18.3
|
||
|
m:
|
||
|
- "1": train/global_step
|
||
|
"6":
|
||
|
- 3
|
||
|
"7": []
|
||
|
python_version: 3.11.10
|
||
|
t:
|
||
|
"1":
|
||
|
- 1
|
||
|
- 5
|
||
|
- 11
|
||
|
- 49
|
||
|
- 51
|
||
|
- 53
|
||
|
- 55
|
||
|
- 71
|
||
|
"2":
|
||
|
- 1
|
||
|
- 5
|
||
|
- 11
|
||
|
- 49
|
||
|
- 51
|
||
|
- 53
|
||
|
- 55
|
||
|
- 71
|
||
|
"3":
|
||
|
- 7
|
||
|
- 13
|
||
|
- 19
|
||
|
- 23
|
||
|
- 55
|
||
|
- 66
|
||
|
"4": 3.11.10
|
||
|
"5": 0.18.3
|
||
|
"6": 4.44.1
|
||
|
"8":
|
||
|
- 5
|
||
|
"9":
|
||
|
"1": transformers_trainer
|
||
|
"12": 0.18.3
|
||
|
"13": linux-x86_64
|
||
|
accelerator_config:
|
||
|
value:
|
||
|
dispatch_batches: null
|
||
|
even_batches: true
|
||
|
gradient_accumulation_kwargs: null
|
||
|
non_blocking: false
|
||
|
split_batches: false
|
||
|
use_seedable_sampler: true
|
||
|
adafactor:
|
||
|
value: false
|
||
|
adam_beta1:
|
||
|
value: 0.9
|
||
|
adam_beta2:
|
||
|
value: 0.999
|
||
|
adam_epsilon:
|
||
|
value: 1e-08
|
||
|
add_cross_attention:
|
||
|
value: false
|
||
|
architectures:
|
||
|
value:
|
||
|
- MT5ForConditionalGeneration
|
||
|
auto_find_batch_size:
|
||
|
value: false
|
||
|
bad_words_ids:
|
||
|
value: null
|
||
|
batch_eval_metrics:
|
||
|
value: false
|
||
|
begin_suppress_tokens:
|
||
|
value: null
|
||
|
bf16:
|
||
|
value: false
|
||
|
bf16_full_eval:
|
||
|
value: false
|
||
|
bos_token_id:
|
||
|
value: null
|
||
|
chunk_size_feed_forward:
|
||
|
value: 0
|
||
|
classifier_dropout:
|
||
|
value: 0
|
||
|
cross_attention_hidden_size:
|
||
|
value: null
|
||
|
d_ff:
|
||
|
value: 1024
|
||
|
d_kv:
|
||
|
value: 64
|
||
|
d_model:
|
||
|
value: 512
|
||
|
data_seed:
|
||
|
value: null
|
||
|
dataloader_drop_last:
|
||
|
value: false
|
||
|
dataloader_num_workers:
|
||
|
value: 0
|
||
|
dataloader_persistent_workers:
|
||
|
value: false
|
||
|
dataloader_pin_memory:
|
||
|
value: true
|
||
|
dataloader_prefetch_factor:
|
||
|
value: null
|
||
|
ddp_backend:
|
||
|
value: null
|
||
|
ddp_broadcast_buffers:
|
||
|
value: null
|
||
|
ddp_bucket_cap_mb:
|
||
|
value: null
|
||
|
ddp_find_unused_parameters:
|
||
|
value: null
|
||
|
ddp_timeout:
|
||
|
value: 1800
|
||
|
debug:
|
||
|
value: []
|
||
|
decoder_start_token_id:
|
||
|
value: 0
|
||
|
deepspeed:
|
||
|
value: null
|
||
|
dense_act_fn:
|
||
|
value: gelu_new
|
||
|
disable_tqdm:
|
||
|
value: false
|
||
|
dispatch_batches:
|
||
|
value: null
|
||
|
diversity_penalty:
|
||
|
value: 0
|
||
|
do_eval:
|
||
|
value: true
|
||
|
do_predict:
|
||
|
value: false
|
||
|
do_sample:
|
||
|
value: false
|
||
|
do_train:
|
||
|
value: false
|
||
|
dropout_rate:
|
||
|
value: 0.1
|
||
|
early_stopping:
|
||
|
value: false
|
||
|
encoder_no_repeat_ngram_size:
|
||
|
value: 0
|
||
|
eos_token_id:
|
||
|
value: 1
|
||
|
eval_accumulation_steps:
|
||
|
value: null
|
||
|
eval_delay:
|
||
|
value: 0
|
||
|
eval_do_concat_batches:
|
||
|
value: true
|
||
|
eval_on_start:
|
||
|
value: false
|
||
|
eval_steps:
|
||
|
value: null
|
||
|
eval_strategy:
|
||
|
value: epoch
|
||
|
eval_use_gather_object:
|
||
|
value: false
|
||
|
evaluation_strategy:
|
||
|
value: epoch
|
||
|
exponential_decay_length_penalty:
|
||
|
value: null
|
||
|
feed_forward_proj:
|
||
|
value: gated-gelu
|
||
|
finetuning_task:
|
||
|
value: null
|
||
|
forced_bos_token_id:
|
||
|
value: null
|
||
|
forced_eos_token_id:
|
||
|
value: null
|
||
|
fp16:
|
||
|
value: true
|
||
|
fp16_backend:
|
||
|
value: auto
|
||
|
fp16_full_eval:
|
||
|
value: false
|
||
|
fp16_opt_level:
|
||
|
value: O1
|
||
|
fsdp:
|
||
|
value: []
|
||
|
fsdp_config:
|
||
|
value:
|
||
|
min_num_params: 0
|
||
|
xla: false
|
||
|
xla_fsdp_grad_ckpt: false
|
||
|
xla_fsdp_v2: false
|
||
|
fsdp_min_num_params:
|
||
|
value: 0
|
||
|
fsdp_transformer_layer_cls_to_wrap:
|
||
|
value: null
|
||
|
full_determinism:
|
||
|
value: false
|
||
|
gradient_accumulation_steps:
|
||
|
value: 64
|
||
|
gradient_checkpointing:
|
||
|
value: false
|
||
|
gradient_checkpointing_kwargs:
|
||
|
value: null
|
||
|
greater_is_better:
|
||
|
value: null
|
||
|
group_by_length:
|
||
|
value: false
|
||
|
half_precision_backend:
|
||
|
value: auto
|
||
|
hub_always_push:
|
||
|
value: false
|
||
|
hub_model_id:
|
||
|
value: null
|
||
|
hub_private_repo:
|
||
|
value: false
|
||
|
hub_strategy:
|
||
|
value: every_save
|
||
|
hub_token:
|
||
|
value: <HUB_TOKEN>
|
||
|
id2label:
|
||
|
value:
|
||
|
"0": LABEL_0
|
||
|
"1": LABEL_1
|
||
|
ignore_data_skip:
|
||
|
value: false
|
||
|
include_inputs_for_metrics:
|
||
|
value: false
|
||
|
include_num_input_tokens_seen:
|
||
|
value: false
|
||
|
include_tokens_per_second:
|
||
|
value: false
|
||
|
initializer_factor:
|
||
|
value: 1
|
||
|
is_decoder:
|
||
|
value: false
|
||
|
is_encoder_decoder:
|
||
|
value: true
|
||
|
is_gated_act:
|
||
|
value: true
|
||
|
jit_mode_eval:
|
||
|
value: false
|
||
|
label_names:
|
||
|
value: null
|
||
|
label_smoothing_factor:
|
||
|
value: 0
|
||
|
label2id:
|
||
|
value:
|
||
|
LABEL_0: 0
|
||
|
LABEL_1: 1
|
||
|
layer_norm_epsilon:
|
||
|
value: 1e-06
|
||
|
learning_rate:
|
||
|
value: 2e-05
|
||
|
length_column_name:
|
||
|
value: length
|
||
|
length_penalty:
|
||
|
value: 1
|
||
|
load_best_model_at_end:
|
||
|
value: false
|
||
|
local_rank:
|
||
|
value: 0
|
||
|
log_level:
|
||
|
value: passive
|
||
|
log_level_replica:
|
||
|
value: warning
|
||
|
log_on_each_node:
|
||
|
value: true
|
||
|
logging_dir:
|
||
|
value: ./results4/runs/Nov09_15-40-04_quadro
|
||
|
logging_first_step:
|
||
|
value: false
|
||
|
logging_nan_inf_filter:
|
||
|
value: true
|
||
|
logging_steps:
|
||
|
value: 500
|
||
|
logging_strategy:
|
||
|
value: steps
|
||
|
lr_scheduler_type:
|
||
|
value: linear
|
||
|
max_grad_norm:
|
||
|
value: 1
|
||
|
max_length:
|
||
|
value: 20
|
||
|
max_steps:
|
||
|
value: -1
|
||
|
metric_for_best_model:
|
||
|
value: null
|
||
|
min_length:
|
||
|
value: 0
|
||
|
model/num_parameters:
|
||
|
value: 300176768
|
||
|
model_type:
|
||
|
value: t5
|
||
|
mp_parameters:
|
||
|
value: ""
|
||
|
neftune_noise_alpha:
|
||
|
value: null
|
||
|
no_cuda:
|
||
|
value: false
|
||
|
no_repeat_ngram_size:
|
||
|
value: 0
|
||
|
num_beam_groups:
|
||
|
value: 1
|
||
|
num_beams:
|
||
|
value: 1
|
||
|
num_decoder_layers:
|
||
|
value: 8
|
||
|
num_heads:
|
||
|
value: 6
|
||
|
num_layers:
|
||
|
value: 8
|
||
|
num_return_sequences:
|
||
|
value: 1
|
||
|
num_train_epochs:
|
||
|
value: 3
|
||
|
optim:
|
||
|
value: adamw_torch
|
||
|
optim_args:
|
||
|
value: null
|
||
|
optim_target_modules:
|
||
|
value: null
|
||
|
output_attentions:
|
||
|
value: false
|
||
|
output_dir:
|
||
|
value: ./results4
|
||
|
output_hidden_states:
|
||
|
value: false
|
||
|
output_scores:
|
||
|
value: false
|
||
|
overwrite_output_dir:
|
||
|
value: false
|
||
|
pad_token_id:
|
||
|
value: 0
|
||
|
past_index:
|
||
|
value: -1
|
||
|
per_device_eval_batch_size:
|
||
|
value: 1
|
||
|
per_device_train_batch_size:
|
||
|
value: 1
|
||
|
per_gpu_eval_batch_size:
|
||
|
value: null
|
||
|
per_gpu_train_batch_size:
|
||
|
value: null
|
||
|
prediction_loss_only:
|
||
|
value: false
|
||
|
prefix:
|
||
|
value: null
|
||
|
problem_type:
|
||
|
value: null
|
||
|
push_to_hub:
|
||
|
value: false
|
||
|
push_to_hub_model_id:
|
||
|
value: null
|
||
|
push_to_hub_organization:
|
||
|
value: null
|
||
|
push_to_hub_token:
|
||
|
value: <PUSH_TO_HUB_TOKEN>
|
||
|
ray_scope:
|
||
|
value: last
|
||
|
relative_attention_max_distance:
|
||
|
value: 128
|
||
|
relative_attention_num_buckets:
|
||
|
value: 32
|
||
|
remove_invalid_values:
|
||
|
value: false
|
||
|
remove_unused_columns:
|
||
|
value: true
|
||
|
repetition_penalty:
|
||
|
value: 1
|
||
|
report_to:
|
||
|
value:
|
||
|
- wandb
|
||
|
restore_callback_states_from_checkpoint:
|
||
|
value: false
|
||
|
resume_from_checkpoint:
|
||
|
value: null
|
||
|
return_dict:
|
||
|
value: true
|
||
|
return_dict_in_generate:
|
||
|
value: false
|
||
|
run_name:
|
||
|
value: ./results4
|
||
|
save_on_each_node:
|
||
|
value: false
|
||
|
save_only_model:
|
||
|
value: false
|
||
|
save_safetensors:
|
||
|
value: true
|
||
|
save_steps:
|
||
|
value: 500
|
||
|
save_strategy:
|
||
|
value: epoch
|
||
|
save_total_limit:
|
||
|
value: null
|
||
|
seed:
|
||
|
value: 42
|
||
|
sep_token_id:
|
||
|
value: null
|
||
|
skip_memory_metrics:
|
||
|
value: true
|
||
|
split_batches:
|
||
|
value: null
|
||
|
suppress_tokens:
|
||
|
value: null
|
||
|
task_specific_params:
|
||
|
value: null
|
||
|
temperature:
|
||
|
value: 1
|
||
|
tf_legacy_loss:
|
||
|
value: false
|
||
|
tf32:
|
||
|
value: null
|
||
|
tie_encoder_decoder:
|
||
|
value: false
|
||
|
tie_word_embeddings:
|
||
|
value: false
|
||
|
tokenizer_class:
|
||
|
value: T5Tokenizer
|
||
|
top_k:
|
||
|
value: 50
|
||
|
top_p:
|
||
|
value: 1
|
||
|
torch_compile:
|
||
|
value: false
|
||
|
torch_compile_backend:
|
||
|
value: null
|
||
|
torch_compile_mode:
|
||
|
value: null
|
||
|
torch_dtype:
|
||
|
value: float32
|
||
|
torch_empty_cache_steps:
|
||
|
value: null
|
||
|
torchdynamo:
|
||
|
value: null
|
||
|
torchscript:
|
||
|
value: false
|
||
|
tpu_metrics_debug:
|
||
|
value: false
|
||
|
tpu_num_cores:
|
||
|
value: null
|
||
|
transformers_version:
|
||
|
value: 4.44.1
|
||
|
typical_p:
|
||
|
value: 1
|
||
|
use_bfloat16:
|
||
|
value: false
|
||
|
use_cache:
|
||
|
value: true
|
||
|
use_cpu:
|
||
|
value: false
|
||
|
use_ipex:
|
||
|
value: false
|
||
|
use_legacy_prediction_loop:
|
||
|
value: false
|
||
|
use_mps_device:
|
||
|
value: false
|
||
|
vocab_size:
|
||
|
value: 250112
|
||
|
warmup_ratio:
|
||
|
value: 0
|
||
|
warmup_steps:
|
||
|
value: 0
|
||
|
weight_decay:
|
||
|
value: 0.01
|