Merge pull request #21 from facebookresearch/tuan/fix_17
Fix data schema in example evaluation script
antoine-tran authored Jan 16, 2025
2 parents 67b7ccc + 482e7d1 commit d640223
Showing 6 changed files with 62 additions and 29 deletions.
14 changes: 12 additions & 2 deletions examples/evaluation/README.md
@@ -21,7 +21,7 @@ Next, we download and parse the content (source text and summaries), saving diff
```shell
python prepare_evaluation_data.py prepare_data \
--dataset_name=cnn_dailymail \
--output_dir=jsonl_dataset/cnn_dailymail \
--output_dir=jsonl_dataset \
--source_text_column=article \
--target_text_column=highlights \
--version=3.0.0 \
@@ -31,6 +31,8 @@ python prepare_evaluation_data.py prepare_data \

Explanation: In the above script, `cnn_dailymail` and `3.0.0` are the name and configuration of the dataset as available in HuggingFace `datasets`, and `article` and `highlights` are the source and summary columns. The `prompt_prefix` and `prompt_suffix` arguments are optional; if specified, they are prepended and appended to each source text to form the complete prompt. These arguments are useful if you want to embed the prompts into the dataset and have them processed together with the text in one pass. Alternatively, you can specify them at a later stage, when you evaluate the model (in which case the model processes the prompts on the fly).

> **_NOTE:_** When `prompt_prefix` or `prompt_suffix` is specified, the dataset schema changes: the columns are renamed to "prompt" for the input and "answer" for the output. This indicates that we are handling the "processed" dataset and not the original one.

The output is stored in per-split files `[split].jsonl` under the directory `output_dir`.
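For concreteness, the snippet below sketches one line of the resulting JSONL when a prompt is embedded, mirroring the record layout written by `prepare_evaluation_data.py` (the article, summary, and prompt strings are made-up placeholders):

```python
import json

# Hypothetical CNN/DailyMail example standing in for a real row.
article = "A strong storm hit the coast on Monday, causing minor flooding."
highlights = "Storm hits coast on Monday; minor flooding reported."

prompt_prefix = "Summarize the following article:\n"
prompt_suffix = "\nSummary:"

# With a prompt embedded, the source column becomes "prompt" and the target column "answer".
record = {
    "prompt": prompt_prefix + article + prompt_suffix,
    "answer": highlights,
    "split": "test",
    "category": "cnn_dailymail",
}
print(json.dumps(record))
```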


@@ -41,6 +43,8 @@ To perform sentence splitting and sonar embedding for each split, run the follow
```shell
python prepare_evaluation_data.py embed \
--input_path=jsonl_dataset/cnn_dailymail/test.jsonl \
--input_column=article \
--output_column=highlights \
--output_dir=parquet_dataset/cnn_dailymail \
--lang=eng_Latn \
--mode=slurm \
@@ -92,12 +96,16 @@ uv run torchrun --standalone --nnodes=1 --nproc-per-node=1 -m lcm.evaluation \
--task_args '{"max_gen_len": 200}' \
--dataset_dir jsonl_dataset/cnn_dailymail \
--data_loading.batch_size 16 \
--dataset.source_text_column prompt \
--dataset.source_target_column answer \
--dump_dir output_results
```

In the example above, we load the model "meta-llama/Llama-3.1-8B-Instruct" as [specified](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on HuggingFace, evaluate it on the CNN DailyMail data that we processed with the `prepare_evaluation_data.py` script in Step 1.1, and store the results in the folder specified via `dump_dir`. The argument `dataset_dir` refers to the value of the argument `output_dir` in Step 1.1.

You can also customize the prompt used to evaluate the LLM for each evaluation run. To do this, instead of specifying the `prompt_prefix` and `prompt_suffix` when preparing the data (as shown in the example in Section 1.1), we specify `dataset.source_prefix_text` and `dataset.source_suffix_text` during the evaluation run:
In some cases, the model requires an authentication token for evaluation. You can obtain one from HuggingFace (see [User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens)), then add the parameter `--use_auth_token [YOUR TOKEN]` to the CLI command.
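If you prefer to authenticate once instead of passing the token on every command, one option is the `huggingface_hub` client; this is a sketch that assumes `huggingface_hub` is installed and that downstream tools pick up the cached credentials (the token value is a placeholder):

```python
from huggingface_hub import login

# Stores the token in the local Hugging Face cache so later downloads can use it.
login(token="hf_your_token_here")
```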

In the above example, we need to provide the `source_text_column` and `source_target_column` parameters because, in Step 1, we injected the prompts directly into the dataset and renamed the columns accordingly (to differentiate the processed dataset from the "original" one). You can also skip this step and customize the prompt for each evaluation run. To do this, instead of specifying `prompt_prefix` and `prompt_suffix` when preparing the data (as shown in the example in Section 1.1), we specify `dataset.source_prefix_text` and `dataset.source_suffix_text` during the evaluation run:

```shell
uv run torchrun --standalone --nnodes=1 --nproc-per-node=1 -m lcm.evaluation \
@@ -113,6 +121,8 @@ uv run torchrun --standalone --nnodes=1 --nproc-per-node=1 -m lcm.evaluation \
--dump_dir output_results
```

Note the missing parameters `source_text_column` and `target_text_column` and the new parameters `source_prefix_text` and `target_prefix_text`: in this case we do not modify the column schema, so the original text columns ("article", "highlights") are kept and do not need to be specified in the CLI.

It is also possible to provide the prompt from a YAML file. This is handy when you have to engineer the prompt carefully and it becomes long and detailed. We provide one example prompt in the file [instruction.yaml](./instruction.yaml). The example command is:

```shell
53 changes: 39 additions & 14 deletions examples/evaluation/prepare_evaluation_data.py
@@ -12,6 +12,7 @@

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Literal, Optional

@@ -48,8 +49,16 @@
OUTPUT_KEY = "answer"


@dataclass
class SonarColumnRenameAndEmbedConfig(SonarTextEmbedderConfig):
input_column: str = INPUT_KEY
output_column: Optional[str] = OUTPUT_KEY


class InstSonarEmbedder(SonarTextBatchEmbedder):
def __init__(self, config: SonarTextEmbedderConfig) -> None:
config: SonarColumnRenameAndEmbedConfig

def __init__(self, config: SonarColumnRenameAndEmbedConfig) -> None:
super().__init__(config)
self.sat_splitter = SaT("sat-3l")
self.sat_splitter.to("cuda")
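For reference, here is a standalone sketch of the SaT sentence splitter configured above, assuming the `wtpsplit` package that provides `SaT` is installed (running on CPU for simplicity; the call to `.to("cuda")` is optional):

```python
from wtpsplit import SaT

# Same small sentence-splitting model the embedder wraps.
sat = SaT("sat-3l")
text = "The storm made landfall on Monday. Thousands lost power. Crews are working to restore it."
print(sat.split(text))  # a list of sentence strings
```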
@@ -120,65 +129,81 @@ def resplit_long_sentences(
def __call__(self, batch: pa.Table) -> pa.Table:
batch = batch_to_pandas(batch)

batch[f"{OUTPUT_KEY}_sentences"] = self.split_one_single_column(
batch[OUTPUT_KEY]
batch[f"{INPUT_KEY}_sentences"] = self.split_one_single_column(
batch[self.config.input_column]
)
# Avoid too much resplitting on the target side
batch[f"{OUTPUT_KEY}_sentences"] = self.resplit_long_sentences(
batch[f"{OUTPUT_KEY}_sentences"], max_length_char=256
)

batch[f"{INPUT_KEY}_sentences"] = self.split_one_single_column(batch[INPUT_KEY])
batch[f"{INPUT_KEY}_sentences"] = self.resplit_long_sentences(
batch[f"{INPUT_KEY}_sentences"],
max_length_char=256,
)

if self.config.output_column is not None:
batch[f"{OUTPUT_KEY}_sentences"] = self.split_one_single_column(
batch[self.config.output_column]
)
# Avoid too much resplitting on the target side
batch[f"{OUTPUT_KEY}_sentences"] = self.resplit_long_sentences(
batch[f"{OUTPUT_KEY}_sentences"], max_length_char=256
)

return super().__call__(batch_to_table(batch))
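The `resplit_long_sentences` helper lives outside this hunk; as a rough, simplified illustration of its intent (capping sentences at `max_length_char` characters), here is a naive standalone sketch rather than the repository's implementation:

```python
from typing import List

def resplit_long(sentences: List[str], max_length_char: int = 256) -> List[str]:
    """Break any over-long 'sentence' into fixed-size character chunks."""
    out: List[str] = []
    for sent in sentences:
        if len(sent) <= max_length_char:
            out.append(sent)
        else:
            out.extend(sent[i:i + max_length_char] for i in range(0, len(sent), max_length_char))
    return out

print(resplit_long(["A short sentence.", "x" * 600]))  # one short item plus three 256/256/88-char chunks
```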


def prepare_data(
dataset_name: str,
output_dir: str,
source_text_column: str,
target_text_column: Optional[str] = None,
source_text_column: str = INPUT_KEY,
target_text_column: Optional[str] = OUTPUT_KEY,
version: Optional[str] = None,
prompt_prefix: Optional[str] = None,
prompt_suffix: Optional[str] = None,
):
"""Download HuggingFace datasets and parse them into JSON format"""
ds = datasets.load_dataset(dataset_name, version)

prompt_prefix = prompt_prefix or ""
prompt_suffix = prompt_suffix or ""

# If there is no prompt added to the dataset, we keep the original column names
if not prompt_prefix and not prompt_suffix:
source = source_text_column
target = target_text_column
else:
source = INPUT_KEY
target = OUTPUT_KEY

for split in SPLITS:
with open(
Path(output_dir) / f"{dataset_name}/{split}.jsonl", "w", encoding="utf-8"
) as o:
for item in ds[split]:
prompt = prompt_prefix + item[source_text_column] + prompt_suffix
output_item = {
INPUT_KEY: prompt,
source: prompt,
"split": split,
"category": f"{dataset_name}",
}
if target_text_column is not None:
output_item[OUTPUT_KEY] = item[target_text_column]
output_item[target] = item[target_text_column]
o.write(json.dumps(output_item) + "\n")
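To sanity-check what this loop wrote, a split can be loaded back with pandas; this is a sketch that reuses the path from the README example, and the columns depend on whether a prompt was embedded:

```python
import pandas as pd

df = pd.read_json("jsonl_dataset/cnn_dailymail/test.jsonl", lines=True)
print(df.columns.tolist())  # e.g. ["prompt", "answer", "split", "category"] when a prompt was embedded
print(len(df), "examples")
```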


async def embed(
input_path: str,
output_dir: str,
source_text_column: str = INPUT_KEY,
target_text_column: Optional[str] = OUTPUT_KEY,
lang: str = "eng_Latn",
mode: Literal["local", "slurm"] = "local",
log_dir: Optional[str] = None,
):
inst_sonar_config = SonarTextEmbedderConfig(
inst_sonar_config = SonarColumnRenameAndEmbedConfig(
column_config=[
LangColumnConfig(f"{OUTPUT_KEY}_sentences", lang_value=lang),
LangColumnConfig(f"{INPUT_KEY}_sentences", lang_value=lang),
],
input_column=source_text_column,
output_column=target_text_column,
batch_size=32,
device="cuda",
)
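Once the embedding job finishes, the parquet output can be inspected to verify the sentence and embedding columns; this sketch uses the output directory from the README example, and the exact file layout and column names are assumptions that depend on the embedding pipeline:

```python
import glob

import pyarrow.parquet as pq

files = glob.glob("parquet_dataset/cnn_dailymail/**/*.parquet", recursive=True)
print(len(files), "parquet files found")
print(pq.read_table(files[0]).schema)  # shows whatever columns the embed step produced
```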
14 changes: 5 additions & 9 deletions lcm/evaluation/predictors/gemma.py
@@ -6,7 +6,6 @@
from dataclasses import dataclass

from lcm.evaluation.api import (
GROUND_TRUTH_COLUMN,
PREDICTION_COLUMN,
Example,
)
@@ -32,7 +31,7 @@ def from_config(config: GemmaPredictorConfig, **kwargs) -> "GemmaPredictor": #
def post_process(self, x: Example) -> Example:
"""Handle the cleaning of response from Gemma models"""

pred = x[PREDICTION_COLUMN]
pred = x.pop(PREDICTION_COLUMN)

# Pretrained model
if not self.config.model_name.endswith("-it"):
@@ -56,10 +55,7 @@ def post_process(self, x: Example) -> Example:
colon_idx = pred.find(":")
pred = pred[colon_idx + 1 :].strip()

if GROUND_TRUTH_COLUMN in x:
return {
PREDICTION_COLUMN: pred,
GROUND_TRUTH_COLUMN: x[GROUND_TRUTH_COLUMN],
}
else:
return {PREDICTION_COLUMN: pred}
return {
PREDICTION_COLUMN: pred,
**x,
}
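The effect of the new return value (pop the raw prediction, clean it, then spread the rest of the example back in) can be seen with plain dictionaries; the column names below are hypothetical stand-ins, not the library's actual constants:

```python
example = {"prediction": "  Summary: rain expected on Monday", "targets": "Rain expected Monday.", "id": 42}

pred = example.pop("prediction").strip()  # clean the raw model output
result = {"prediction": pred, **example}  # ground truth and any other fields are carried through

print(result)
# {'prediction': 'Summary: rain expected on Monday', 'targets': 'Rain expected Monday.', 'id': 42}
```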
4 changes: 2 additions & 2 deletions lcm/evaluation/tasks/cnn_dailymail.py
@@ -58,8 +58,8 @@ def get_task_config_llm(
dataset.target_text_column = target_text_column

# Add original columns for judge tasks
dataset.columns = [source_text_column, target_text_column]
postprocess_fn = partial(default_text_postprocess,source_text_column=source_text_column) # fmt: skip
dataset.columns = [dataset.source_text_column, dataset.target_text_column]
postprocess_fn = partial(default_text_postprocess, source_text_column=dataset.source_text_column) # fmt: skip

return GenerationTaskConfig(
dataset=dataset,
2 changes: 1 addition & 1 deletion lcm/evaluation/tasks/xsum.py
@@ -55,7 +55,7 @@ def get_task_config_llm(
dataset.target_text_column = target_text_column

# Add original columns for judge tasks
dataset.columns = [source_text_column, target_text_column]
dataset.columns = [dataset.source_text_column, dataset.target_text_column]

postprocess_fn = partial(
default_text_postprocess, source_text_column=source_text_column
4 changes: 3 additions & 1 deletion lcm/evaluation/utils/data_utils.py
@@ -150,7 +150,9 @@ def _add_affix(
if column in inp.keys():
inp[column] = str(prefix or "") + to_str(inp[column]) + str(suffix or "")
else:
assert orig_column in inp, f"Missing {column} or {orig_column}"
assert orig_column in inp, (
f"Missing {column} or {orig_column} (Found columns: {inp.keys()})"
)
inp[column] = str(prefix or "") + to_str(inp[orig_column]) + str(suffix or "")
return inp
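For context, `_add_affix` is what applies `dataset.source_prefix_text` and `dataset.source_suffix_text` at evaluation time. Below is a simplified standalone sketch of the fallback behaviour asserted above; the function name and signature are placeholders, not the library API:

```python
def add_affix(inp: dict, column: str, orig_column: str, prefix: str = "", suffix: str = "") -> dict:
    """Wrap `column` if present; otherwise build it from `orig_column`."""
    if column in inp:
        inp[column] = prefix + str(inp[column]) + suffix
    else:
        assert orig_column in inp, f"Missing {column} or {orig_column} (Found columns: {list(inp)})"
        inp[column] = prefix + str(inp[orig_column]) + suffix
    return inp

print(add_affix({"article": "A storm hit the coast."}, "prompt", "article",
                prefix="Summarize: ", suffix="\nSummary:"))
```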

