New Version Usage Issue · Issue #24724 · huggingface/transformers
System Info
- transformers version: 4.29.0
- Platform: Linux-3.10.0-1160.92.1.el7.x86_64-x86_64-with-glibc2.31
- Python version: 3.10.9
- Huggingface_hub version: 0.15.1
- Safetensors version: 0.3.1
- PyTorch version (GPU?): 2.0.1+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?:
- Using distributed or parallel set-up in script?:
Who can help?
No response
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
- My own task or dataset (give details below)
Reproduction
Here is my code:

```python
import os
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    DataCollatorForSeq2Seq,
)

IGNORE_INDEX = -100

# Prompt templates (Chinese): 指令 = instruction, 输入 = input, 回答 = response.
PROMPT_DICT = {
    "prompt_input": (
        "### 指令:\n{instruction}\n\n### 输入:\n{input}\n\n### 回答:"
    ),
    "prompt_no_input": (
        "### 指令:\n{instruction}\n\n### 回答:"
    ),
}


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    model_name_or_path: Optional[str] = field(default=None, metadata={"help": "model name"})
    cache_dir: Optional[str] = field(default=None, metadata={"help": "model cache path"})
    data_path: str = field(default=None, metadata={"help": "data path"})
    mask_input: bool = field(default=True, metadata={"help": "whether to mask the instruction and compute the loss only on the response"})
    model_max_length: int = field(default=512, metadata={"help": "maximum sequence length"})
    optim: str = field(default="adamw_torch", metadata={"help": "optimizer"})


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple([torch.tensor(instance[key]) for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )


def train():
    local_rank = int(os.environ["LOCAL_RANK"])
    parser = transformers.HfArgumentParser(TrainingArguments)
    training_args, = parser.parse_args_into_dataclasses()
    if local_rank == 0:
        print(training_args)

    tokenizer = AutoTokenizer.from_pretrained(
        training_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right"
    )
    model = AutoModelForCausalLM.from_pretrained(
        training_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        # torch_dtype=torch.float16
    )

    def generate_and_tokenize(sample):
        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
        source = prompt_input.format_map(sample) if sample.get("input", "") != "" \
            else prompt_no_input.format_map(sample)
        target = f"\n{sample['output']}{tokenizer.eos_token}"
        complete = source + target
        # </s> 1 2 3 : a b </s>
        complete_tokenized = tokenizer(complete,
                                       truncation=True,
                                       max_length=training_args.model_max_length)
        # </s> 1 2 3 :
        source_tokenized = tokenizer(source,
                                     truncation=True,
                                     max_length=training_args.model_max_length)
        if training_args.mask_input:
            source_len = len(source_tokenized['input_ids'])
            complete_tokenized['labels'] = [IGNORE_INDEX] * source_len + complete_tokenized['input_ids'][source_len:]
        else:
            complete_tokenized['labels'] = complete_tokenized['input_ids'].copy()
        return complete_tokenized

    tokenized_path = os.path.join(os.path.dirname(training_args.data_path),
                                  f"{training_args.model_name_or_path.split('/')[-1]}_tokenized")
    if not os.path.exists(tokenized_path):
        logging.warning("tokenized data does not exist, tokenizing data...")
        data = load_dataset("json", data_files=training_args.data_path)
        train_dataset = data['train'].shuffle().map(generate_and_tokenize,
                                                    batched=False,
                                                    remove_columns=["instruction", "input", "output"])
        if local_rank == 0:
            train_dataset.save_to_disk(tokenized_path)
    else:
        logging.warning("tokenized data exists, loading data...")
        train_dataset = load_from_disk(tokenized_path)

    # data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer,
                                           label_pad_token_id=IGNORE_INDEX,
                                           pad_to_multiple_of=8)
    logging.warning("training...")
    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=None,
                      data_collator=data_collator)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)
    tokenizer.save_pretrained(save_directory=training_args.output_dir)


if __name__ == '__main__':
    train()
```
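For reference, the prompt-masking logic in `generate_and_tokenize` can be sanity-checked in isolation with something like the sketch below. This is only an illustration: `your-model-path` is a placeholder for the actual checkpoint, and the sample is made up.

```python
# Standalone sanity check for the prompt-masking scheme used above.
# NOTE: "your-model-path" is a placeholder; the sample is invented for illustration.
from transformers import AutoTokenizer

IGNORE_INDEX = -100
tokenizer = AutoTokenizer.from_pretrained("your-model-path")

sample = {"instruction": "把下面的句子翻译成英文。", "input": "你好", "output": "Hello"}
source = "### 指令:\n{instruction}\n\n### 输入:\n{input}\n\n### 回答:".format_map(sample)
target = f"\n{sample['output']}{tokenizer.eos_token}"

complete_ids = tokenizer(source + target)["input_ids"]
prompt_len = len(tokenizer(source)["input_ids"])

# Mask the prompt; keep the loss only on the response tokens.
labels = [IGNORE_INDEX] * prompt_len + complete_ids[prompt_len:]

print(labels)
# Decoding the unmasked tail should give back (roughly) the response plus EOS.
print(tokenizer.decode([t for t in labels if t != IGNORE_INDEX]))
```

The printed labels should start with a run of -100 covering the prompt, followed by the token ids of the response and the EOS token.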
Expected behavior
Has anyone encountered this problem? With the same instruction fine-tuning code, training runs successfully under transformers 4.29.0, but after upgrading to 4.30.2 it fails with an out-of-memory (OOM) error. Does anyone know the reason behind this?
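In case it helps with narrowing this down, below is a minimal sketch of how peak GPU memory per training step could be compared between the two installs. The callback name and the GiB reporting are my own illustration, not an official transformers utility.

```python
# Hypothetical callback (not part of my script) to log peak GPU memory per step,
# so the two transformers versions can be compared under identical settings.
import torch
from transformers import TrainerCallback


class MemoryLoggerCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if torch.cuda.is_available() and state.is_local_process_zero:
            peak_gib = torch.cuda.max_memory_allocated() / 1024 ** 3
            print(f"step {state.global_step}: peak allocated {peak_gib:.2f} GiB")
            torch.cuda.reset_peak_memory_stats()
```

Registered with `trainer.add_callback(MemoryLoggerCallback())` before `trainer.train()`, it should show at which step and by how much memory use diverges between 4.29.0 and 4.30.2.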