Data Collator class to use for BLOOM

#238
by monta - opened

Do we need to use DataCollatorForLanguageModeling, with the EOS (end-of-sequence) token as the padding token, for BLOOM?

The Causal language modeling task guide says:

Now create a batch of examples using DataCollatorForLanguageModeling. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. Use the end-of-sequence token as the padding token and set mlm=False. This will use the inputs as labels shifted to the right by one element:

from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
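
For reference, here is a minimal sketch of the behaviour I expect from this collator (using the small gpt2 tokenizer purely as a stand-in, not the BLOOM tokenizer): input_ids and attention_mask are padded to the longest sequence in the batch, and labels is a copy of input_ids with the padded positions set to -100 so they are ignored in the loss.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# Stand-in tokenizer for illustration only; the real code uses the BLOOM tokenizer.
tok = AutoTokenizer.from_pretrained("gpt2")
tok.pad_token = tok.eos_token
collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

features = [tok("a short example"), tok("a somewhat longer example sentence")]
batch = collator(features)
print(batch["input_ids"].shape)   # (2, longest_length_in_batch) -- padded dynamically
print(batch["labels"])            # copy of input_ids with pad positions set to -100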

However, if I use DataCollatorForLanguageModeling, I get the error:

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
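
If I read the message correctly, the "excessive nesting" part means one of the features is a ragged list of lists, which cannot be converted into a single tensor. A tiny illustration of that conversion failure (not from my training code):

import torch

# A ragged list of lists cannot be packed into a single tensor:
torch.tensor([[1, 2, 3], [4, 5]])   # ValueError: expected sequence of length 3 at dim 1 (got 2)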

Environment

!cat /etc/os-release
PRETTY_NAME="Debian GNU/Linux 10 (buster)"

!transformers-cli env
- `transformers` version: 4.28.0
- Platform: Linux-4.14.309-231.529.amzn2.x86_64-x86_64-with-debian-10.6
- Python version: 3.7.10
- Huggingface_hub version: 0.13.4
- Safetensors version: not installed
- PyTorch version (GPU?): 1.13.1+cu117 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: YES
- Using distributed or parallel set-up in script?: <fill in>

Code for Tokenization

# Imports used by the code below; tokenizer, template, the Template class, the MAX_* constants,
# and NUM_CPUS are defined earlier in the notebook.
import re
from typing import Callable, Dict, List

from datasets import load_dataset

DATASET_STREAMING: bool = False
train = load_dataset("xsum", split="train", streaming=DATASET_STREAMING)

# --------------------------------------------------------------------------------
# Function to generate prompt from XSUM dataset
# --------------------------------------------------------------------------------
def get_convert_to_prompt(template: Template) -> Callable:
    def _convert_to_prompt(example: Dict[str, str]) -> Dict[str, str]:
        """Generate prompt as a dictionary:
        {
            "prompt": "Summarize: <document>\n<summary>"
        }

        Args:
            example: a single {document, summary} pair to which the template is applied
        Returns: a dictionary containing the prompt
        """
        # assert isinstance(example, dict), f"expected dict but {type(example)}.\n{example}"
        assert isinstance(example['document'], str), f"expected str but {type(example['document'])}."

        prompt, response = template.apply(example=example, truncate=False)
        return {
            "prompt": " ".join(
                re.sub(r'[\s\'\"]+', ' ', prompt).split(' ')[:MAX_REQUEST_LENGTH-1]  # -1 for \n
            ) + "\n" + " ".join(
                re.sub(r'[\s\'\"]+', ' ', response).split(' ')[:MAX_RESPONSE_LENGTH-1]
            ) + "\n"
        }

    return _convert_to_prompt

convert_to_prompt: Callable = get_convert_to_prompt(template=template)

# --------------------------------------------------------------------------------
# Function to tokenize prompt
# --------------------------------------------------------------------------------
def tokenize_prompt(example):
    """Generate the model inputs in the dictionary with format:
    {
        "input_ids": List[int], 
        "attention_mask": List[int]",
        "labels": List[int]
    }
    
    Args:
        example:   a dictionary of format {
            "prompt": "Summarize:<document>\n<summary>\n",
        }
    """    
    assert isinstance(example['prompt'], str), f"expected str, got {type(example['prompt'])}"
    inputs: Dict[str, List[int]] = tokenizer(
        example['prompt'], 
        max_length=MAX_TOKEN_LENGTH,   
        truncation=True,
        # padding='max_length',
    )
    inputs["labels"] = inputs["input_ids"].copy()   # Casual LM get the same tokens as inputs and label
    
    return inputs

remove_column_names: List[str] = list(train.features.keys())

# --------------------------------------------------------------------------------
# Tokenization by applying function
# --------------------------------------------------------------------------------
tokenized_train = train.map(
    function=convert_to_prompt, 
    batched=False,
    remove_columns=remove_column_names,
    num_proc=NUM_CPUS
).map(
    function=tokenize_prompt, 
    batched=False,
    remove_columns=['prompt'],
    num_proc=NUM_CPUS
).shuffle(
    seed=42
).with_format(
    "torch"
)
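
To narrow this down, a debugging step I plan to try (using the data_collator defined in the Training section below): pull two tokenized examples and call the collator on them directly, outside the Trainer, to confirm whether the ragged labels column is what triggers the ValueError.

# Debugging sketch; assumes tokenized_train (above) and data_collator (Training section below).
samples = [tokenized_train[i] for i in range(2)]
for s in samples:
    print({k: len(v) for k, v in s.items()})   # sequence lengths differ between the two examples

batch = data_collator(samples)   # expected to raise the same ValueError if "labels" is ragged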

Training:

data_collator = DataCollatorForLanguageModeling(
   tokenizer=tokenizer, 
   mlm=False,
   return_tensors='pt'
)

training_args = TrainingArguments(
    output_dir="bloom_finetuned",
    max_steps=MAX_STEPS,
    num_train_epochs=3,
    per_device_train_batch_size=1,
#    per_device_eval_batch_size=1,
    learning_rate=2e-5,
    weight_decay=0.01, 
    fp16=USE_FLOAT16,
    no_cuda=False,
#    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
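
One variant I am considering, but have not tested, is to drop the manual labels copy in tokenize_prompt and let the collator (mlm=False) build the labels itself from the padded input_ids, which would avoid having a ragged labels column in the dataset. A sketch with a hypothetical helper:

# Sketch of the variant (hypothetical helper, not in my current code): tokenize without
# adding "labels"; DataCollatorForLanguageModeling(mlm=False) then creates labels from
# the padded input_ids at collation time.
def tokenize_prompt_no_labels(example):
    return tokenizer(
        example["prompt"],
        max_length=MAX_TOKEN_LENGTH,
        truncation=True,
    )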
