Data Collator class to use for BLOOM
Do we need to use DataCollatorForLanguageModeling and EOS (End of Sequence) token for padding token for BLOOM?
The Causal language modeling guide says:
Now create a batch of examples using DataCollatorForLanguageModeling. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. Use the end-of-sequence token as the padding token and set mlm=False. This will use the inputs as labels shifted to the right by one element:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
However, if I use DataCollatorForLanguageModeling, I get the error:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
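A minimal sketch of how I can reproduce the same error outside the Trainer (my own repro attempt; bigscience/bloom-560m is just an illustrative checkpoint). The collator pads input_ids and attention_mask, but the pre-built labels column has a different length per example, which matches the `labels` named in the error:

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")  # illustrative checkpoint
tokenizer.pad_token = tokenizer.eos_token
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

features = []
for text in ["short prompt\n", "a noticeably longer prompt with quite a few more tokens\n"]:
    enc = tokenizer(text)                    # no padding, as in tokenize_prompt below
    enc["labels"] = enc["input_ids"].copy()  # same pattern as in tokenize_prompt below
    features.append(enc)

batch = collator(features)  # raises the ValueError quoted above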
Environment
!cat /etc/os-release
PRETTY_NAME="Debian GNU/Linux 10 (buster)"
!transformers-cli env
- `transformers` version: 4.28.0
- Platform: Linux-4.14.309-231.529.amzn2.x86_64-x86_64-with-debian-10.6
- Python version: 3.7.10
- Huggingface_hub version: 0.13.4
- Safetensors version: not installed
- PyTorch version (GPU?): 1.13.1+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: YES
- Using distributed or parallel set-up in script?: <fill in>
Code for Tokenization
import re
from typing import Callable, Dict, List

from datasets import load_dataset

# Template, tokenizer, MAX_REQUEST_LENGTH, MAX_RESPONSE_LENGTH, MAX_TOKEN_LENGTH
# and NUM_CPUS are defined earlier in the notebook.
DATASET_STREAMING: bool = False
train = load_dataset("xsum", split="train", streaming=DATASET_STREAMING)
# --------------------------------------------------------------------------------
# Function to generate prompt from XSUM dataset
# --------------------------------------------------------------------------------
def get_convert_to_prompt(template: Template) -> Callable:
def _convert_to_prompt(example: Dict[str, str]) -> Dict[str, str]:
"""Generate prompt as a dictionary:
{
"prompt": "Summarize: <document>\n<summary>"
}
Args:
example: single {document, summary} pair to be able to apply template
Returns: a dictionary of prompt
"""
# assert isinstance(example, dict), f"expected dict but {type(example)}.\n{example}"
assert isinstance(example['document'], str), f"expected str but {type(example['document'])}."
prompt, response = template.apply(example=example, truncate=False)
return {
"prompt": " ".join(
re.sub(r'[\s\'\"]+', ' ', prompt).split(' ')[:MAX_REQUEST_LENGTH-1] # -1 for \n
) + "\n" + " ".join(
re.sub(r'[\s\'\"]+', ' ', response).split(' ')[:MAX_RESPONSE_LENGTH-1]
) + "\n"
}
return _convert_to_prompt
convert_to_prompt: Callable = get_convert_to_prompt(template=template)
# --------------------------------------------------------------------------------
# Function to tokenize prompt
# --------------------------------------------------------------------------------
def tokenize_prompt(example):
"""Generate the model inputs in the dictionary with format:
{
"input_ids": List[int],
"attention_mask": List[int]",
"labels": List[int]
}
Args:
example: a dictionary of format {
"prompt": "Summarize:<document>\n<summary>\n",
}
"""
assert isinstance(example['prompt'], str), f"expected str, got {type(example['prompt'])}"
inputs: Dict[str, List[int]] = tokenizer(
example['prompt'],
max_length=MAX_TOKEN_LENGTH,
truncation=True,
# padding='max_length',
)
inputs["labels"] = inputs["input_ids"].copy() # Casual LM get the same tokens as inputs and label
return inputs
remove_column_names: List[str] = list(train.features.keys())
# --------------------------------------------------------------------------------
# Tokenization by applying function
# --------------------------------------------------------------------------------
tokenized_train = train.map(
function=convert_to_prompt,
batched=False,
remove_columns=remove_column_names,
num_proc=NUM_CPUS
).map(
function=tokenize_prompt,
batched=False,
remove_columns=['prompt'],
num_proc=NUM_CPUS
).shuffle(
seed=42
).with_format(
"torch"
)
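As a sanity check on the mapped dataset (my addition; it assumes tokenized_train is built exactly as above), every record carries input_ids, attention_mask and labels of the same, but example-dependent, length:

sample = tokenized_train[0]
print(sample.keys())                                     # input_ids, attention_mask, labels
print(len(sample["input_ids"]), len(sample["labels"]))   # lengths vary from example to example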
Training:
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False,
return_tensors='pt'
)
training_args = TrainingArguments(
output_dir="bloom_finetuned",
max_steps=MAX_STEPS,
num_train_epochs=3,
per_device_train_batch_size=1,
# per_device_eval_batch_size=1,
learning_rate=2e-5,
weight_decay=0.01,
fp16=USE_FLOAT16,
no_cuda=False,
# evaluation_strategy="epoch",
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_train,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
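MAX_STEPS, USE_FLOAT16, model and compute_metrics are defined earlier in the notebook. Training is then launched as usual; this is the point at which the ValueError above is raised, once the first batch goes through the collator:

trainer.train()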