_name_or_path: value: T5Autocorrection228 _wandb: value: cli_version: 0.18.3 m: - "1": train/global_step "6": - 3 "7": [] - "1": train/learning_rate "5": 1 "6": - 1 - 3 "7": [] - "1": train/epoch "5": 1 "6": - 1 - 3 "7": [] - "1": train/loss "5": 1 "6": - 1 - 3 "7": [] - "1": train/grad_norm "5": 1 "6": - 1 - 3 "7": [] python_version: 3.11.10 t: "1": - 1 - 5 - 11 - 49 - 51 - 53 - 55 - 71 "2": - 1 - 5 - 11 - 49 - 51 - 53 - 55 - 71 "3": - 7 - 13 - 19 - 23 - 55 - 62 - 66 "4": 3.11.10 "5": 0.18.3 "6": 4.44.1 "8": - 5 "9": "1": transformers_trainer "12": 0.18.3 "13": linux-x86_64 accelerator_config: value: dispatch_batches: null even_batches: true gradient_accumulation_kwargs: null non_blocking: false split_batches: false use_seedable_sampler: true adafactor: value: false adam_beta1: value: 0.9 adam_beta2: value: 0.999 adam_epsilon: value: 1e-08 add_cross_attention: value: false architectures: value: - MT5ForConditionalGeneration auto_find_batch_size: value: false bad_words_ids: value: null batch_eval_metrics: value: false begin_suppress_tokens: value: null bf16: value: false bf16_full_eval: value: false bos_token_id: value: null chunk_size_feed_forward: value: 0 classifier_dropout: value: 0 cross_attention_hidden_size: value: null d_ff: value: 1024 d_kv: value: 64 d_model: value: 512 data_seed: value: null dataloader_drop_last: value: false dataloader_num_workers: value: 0 dataloader_persistent_workers: value: false dataloader_pin_memory: value: true dataloader_prefetch_factor: value: null ddp_backend: value: null ddp_broadcast_buffers: value: null ddp_bucket_cap_mb: value: null ddp_find_unused_parameters: value: null ddp_timeout: value: 1800 debug: value: [] decoder_start_token_id: value: 0 deepspeed: value: null dense_act_fn: value: gelu_new disable_tqdm: value: false dispatch_batches: value: null diversity_penalty: value: 0 do_eval: value: false do_predict: value: false do_sample: value: false do_train: value: false dropout_rate: value: 0.1 early_stopping: value: false encoder_no_repeat_ngram_size: value: 0 eos_token_id: value: 1 eval_accumulation_steps: value: null eval_delay: value: 0 eval_do_concat_batches: value: true eval_on_start: value: false eval_steps: value: null eval_strategy: value: "no" eval_use_gather_object: value: false evaluation_strategy: value: null exponential_decay_length_penalty: value: null feed_forward_proj: value: gated-gelu finetuning_task: value: null forced_bos_token_id: value: null forced_eos_token_id: value: null fp16: value: true fp16_backend: value: auto fp16_full_eval: value: false fp16_opt_level: value: O1 fsdp: value: [] fsdp_config: value: min_num_params: 0 xla: false xla_fsdp_grad_ckpt: false xla_fsdp_v2: false fsdp_min_num_params: value: 0 fsdp_transformer_layer_cls_to_wrap: value: null full_determinism: value: false gradient_accumulation_steps: value: 64 gradient_checkpointing: value: false gradient_checkpointing_kwargs: value: null greater_is_better: value: null group_by_length: value: false half_precision_backend: value: auto hub_always_push: value: false hub_model_id: value: null hub_private_repo: value: false hub_strategy: value: every_save hub_token: value: id2label: value: "0": LABEL_0 "1": LABEL_1 ignore_data_skip: value: false include_inputs_for_metrics: value: false include_num_input_tokens_seen: value: false include_tokens_per_second: value: false initializer_factor: value: 1 is_decoder: value: false is_encoder_decoder: value: true is_gated_act: value: true jit_mode_eval: value: false label_names: value: null label_smoothing_factor: value: 0 label2id: value: LABEL_0: 0 LABEL_1: 1 layer_norm_epsilon: value: 1e-06 learning_rate: value: 2e-05 length_column_name: value: length length_penalty: value: 1 load_best_model_at_end: value: false local_rank: value: 0 log_level: value: passive log_level_replica: value: warning log_on_each_node: value: true logging_dir: value: ./results_book/runs/Nov09_17-15-52_quadro logging_first_step: value: false logging_nan_inf_filter: value: true logging_steps: value: 500 logging_strategy: value: steps lr_scheduler_type: value: linear max_grad_norm: value: 1 max_length: value: 20 max_steps: value: -1 metric_for_best_model: value: null min_length: value: 0 model/num_parameters: value: 300176768 model_type: value: t5 mp_parameters: value: "" neftune_noise_alpha: value: null no_cuda: value: false no_repeat_ngram_size: value: 0 num_beam_groups: value: 1 num_beams: value: 1 num_decoder_layers: value: 8 num_heads: value: 6 num_layers: value: 8 num_return_sequences: value: 1 num_train_epochs: value: 3 optim: value: adamw_torch optim_args: value: null optim_target_modules: value: null output_attentions: value: false output_dir: value: ./results_book output_hidden_states: value: false output_scores: value: false overwrite_output_dir: value: false pad_token_id: value: 0 past_index: value: -1 per_device_eval_batch_size: value: 1 per_device_train_batch_size: value: 1 per_gpu_eval_batch_size: value: null per_gpu_train_batch_size: value: null prediction_loss_only: value: false prefix: value: null problem_type: value: null push_to_hub: value: false push_to_hub_model_id: value: null push_to_hub_organization: value: null push_to_hub_token: value: ray_scope: value: last relative_attention_max_distance: value: 128 relative_attention_num_buckets: value: 32 remove_invalid_values: value: false remove_unused_columns: value: true repetition_penalty: value: 1 report_to: value: - wandb restore_callback_states_from_checkpoint: value: false resume_from_checkpoint: value: null return_dict: value: true return_dict_in_generate: value: false run_name: value: ./results_book save_on_each_node: value: false save_only_model: value: false save_safetensors: value: true save_steps: value: 500 save_strategy: value: epoch save_total_limit: value: null seed: value: 42 sep_token_id: value: null skip_memory_metrics: value: true split_batches: value: null suppress_tokens: value: null task_specific_params: value: null temperature: value: 1 tf_legacy_loss: value: false tf32: value: null tie_encoder_decoder: value: false tie_word_embeddings: value: false tokenizer_class: value: T5Tokenizer top_k: value: 50 top_p: value: 1 torch_compile: value: false torch_compile_backend: value: null torch_compile_mode: value: null torch_dtype: value: float32 torch_empty_cache_steps: value: null torchdynamo: value: null torchscript: value: false tpu_metrics_debug: value: false tpu_num_cores: value: null transformers_version: value: 4.44.1 typical_p: value: 1 use_bfloat16: value: false use_cache: value: true use_cpu: value: false use_ipex: value: false use_legacy_prediction_loop: value: false use_mps_device: value: false vocab_size: value: 250112 warmup_ratio: value: 0 warmup_steps: value: 0 weight_decay: value: 0.01