diff --git a/experimental/eval/.gitignore b/experimental/eval/.gitignore new file mode 100644 index 000000000000..a7daadac731f --- /dev/null +++ b/experimental/eval/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.ipynb_checkpoints +reports.* +tabby diff --git a/experimental/eval/README.md b/experimental/eval/README.md new file mode 100644 index 000000000000..13c7d13afc1d --- /dev/null +++ b/experimental/eval/README.md @@ -0,0 +1,7 @@ +# Eval + +## Local +`./eval.sh` + +## Skypilot +`./eval_sky.sh` diff --git a/experimental/eval/config.toml b/experimental/eval/config.toml new file mode 100644 index 000000000000..777e413be84a --- /dev/null +++ b/experimental/eval/config.toml @@ -0,0 +1,2 @@ +[[repositories]] +git_url = "https://github.com/huggingface/text-generation-inference" diff --git a/experimental/eval/docker-compose.cuda.yaml b/experimental/eval/docker-compose.cuda.yaml new file mode 100644 index 000000000000..0c47cebc1ac0 --- /dev/null +++ b/experimental/eval/docker-compose.cuda.yaml @@ -0,0 +1,11 @@ +version: '3.5' +services: + tabby: + command: serve --model TabbyML/SantaCoder-1B --device cuda + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] diff --git a/experimental/eval/docker-compose.yaml b/experimental/eval/docker-compose.yaml index c3e1ef54475a..2e0ef34fb18a 100644 --- a/experimental/eval/docker-compose.yaml +++ b/experimental/eval/docker-compose.yaml @@ -6,7 +6,7 @@ services: platform: linux/amd64 command: scheduler --now volumes: - - "$HOME/.tabby:/data" + - "$PWD/tabby:/data" tabby: depends_on: @@ -15,5 +15,7 @@ services: image: tabbyml/tabby platform: linux/amd64 command: serve --model TabbyML/T5P-220M + ports: + - "8080:8080" volumes: - - "$HOME/.tabby:/data" + - "$PWD/tabby:/data" diff --git a/experimental/eval/eval.sh b/experimental/eval/eval.sh new file mode 100755 index 000000000000..6a697f4612b6 --- /dev/null +++ b/experimental/eval/eval.sh @@ 
#!/bin/bash
# Run the Tabby evaluation locally.
#
# Steps: stage config.toml into ./tabby, (re)start the docker-compose stack
# (with the CUDA override when nvidia-smi succeeds), wait for the server's
# health endpoint, execute main.ipynb with papermill, and render the executed
# notebook to HTML.
#
# Environment:
#   MAX_RECORDS         value passed to the notebook's max_records (default: 3)
#   HEALTH_MAX_RETRIES  health probes (5s apart) before giving up (default: 360)
set -ex

mkdir -p tabby
cp config.toml tabby/

docker-compose down

# Tear the stack down on every exit path. With `set -e`, a failing papermill
# or nbconvert step would otherwise exit before the containers are stopped.
trap 'docker-compose down' EXIT

if nvidia-smi; then
  docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
else
  docker-compose up -d
fi

# Probe the port published by docker-compose.yaml ("8080:8080").
# --fail makes curl exit non-zero on HTTP error statuses, so an error response
# is not mistaken for a ready server. The retry cap keeps a server that never
# starts from hanging the script forever.
retries=0
until curl --fail --silent --show-error -X POST http://localhost:8080/v1/health; do
  retries=$((retries + 1))
  if [ "$retries" -ge "${HEALTH_MAX_RETRIES:-360}" ]; then
    echo "server did not become ready after ${retries} attempts, giving up" >&2
    exit 1
  fi
  echo "server not ready, waiting..."
  sleep 5
done

papermill main.ipynb ./reports.ipynb -r filepattern "./tabby/dataset/*.jsonl" -r max_records "${MAX_RECORDS:-3}"

jupyter nbconvert reports.ipynb --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_cell_tags remove --to html

echo done
def valid_item(item: "processing.Item") -> bool:
    """Return True when *item* is small enough to be evaluated.

    Only items whose body spans at most 10 lines are kept.
    """
    return len(item.body.splitlines()) <= 10


def scorer(label: str, prediction: str) -> float:
    """Score *prediction* against *label* as a normalized edit similarity.

    The score is ``1 - edit_distance / len(label)``, clamped below at 0.0.

    An empty label cannot be normalized by its length, so it is scored
    directly: 1.0 when the prediction is empty as well, otherwise 0.0.
    """
    if not label:
        return 0.0 if prediction else 1.0

    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))


def _first_line(text: str) -> str:
    """Return the first line of *text*, or "" when *text* has no lines."""
    lines = text.splitlines()
    return lines[0] if lines else ""


def run_eval():
    """Yield one result record per sampled evaluation item.

    Reads the notebook-level parameters ``api``, ``filepattern`` and
    ``max_records``. If the health check against ``api`` fails, a hint is
    printed and nothing is yielded.

    Yields:
        dict with keys ``prompt``, ``prediction``, ``label``, ``block_score``
        (whole-body similarity) and ``line_score`` (first-line similarity).
    """
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    except Exception:
        print(f"Tabby Server is not ready, please check if '{api}' is correct.")
        return

    items = [x for x in processing.items_from_filepattern(filepattern) if valid_item(x)]
    if len(items) > max_records:
        # A dedicated, fixed-seed generator keeps the sample reproducible
        # without reseeding the process-wide random state.
        items = random.Random(0xbadbeef).sample(items, max_records)

    for item in items:
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )

        resp = completion.sync(client=client, json_body=request)
        label = item.body
        # Treat a missing response or an empty choice list as an empty
        # prediction instead of aborting the whole run.
        prediction = resp.choices[0].text if resp is not None and resp.choices else ""

        # Always computed per item, so a prediction without any lines scores
        # on its own rather than failing or reusing the previous item's value.
        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=scorer(label, prediction),
            line_score=scorer(_first_line(label), _first_line(prediction)),
        )
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(15, 5))\n", + "\n", + "df.hist(\n", + " column=\"line_score\",\n", + " ax=axes[0],\n", + ")\n", + "\n", + "df.hist(\n", + " column=\"block_score\",\n", + " ax=axes[1],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "3b8d339e-4452-4e2c-823c-0f69f6eb4805", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 promptpredictionlabelblock_scoreline_score
0 attentions=all_attentions,\n", + " cross_attentions=all_cross_attentions,\n", + " )\n", + "\n", + "\n", + "class T5ForConditionalGeneration(T5PreTrainedModel):\n", + " def __init__(self, config: T5Config, weights):\n", + " super().__init__(config)\n", + " self.model_dim = config.d_model\n", + "\n", + " try:\n", + " self.shared = TensorParallelEmbedding(prefix=\"shared\", weights=weights)\n", + " except RuntimeError:\n", + " self.shared = TensorParallelEmbedding(prefix=\"encoder.embed_tokens\", weights=weights)\n", + "\n", + " encoder_config = copy.deepcopy(config)\n", + " encoder_config.is_decoder = False\n", + " encoder_config.use_cache = False\n", + " encoder_config.is_encoder_decoder = False\n", + " self.encoder = /*\n", + " * Copyright (c) 2008-2021, Hazelcast, Inc. All Rights Reserved.\n", + " *\n", + " * Licensed under the Apache License, Version 2.0 (the \"License\");\n", + " * you may not use this file except in compliance with the License.\n", + " * You may obtain a copy of the License at\n", + " *\n", + " * http://www.apache.org/licenses/LICENSE-2.0\n", + " *\n", + " * Unless required by applicable law or agreed to in writing, software\n", + " * distributed under the License is distributed on an \"AS IS\" BASIS,T5Stack(\n", + " config=encoder_config,\n", + " prefix=\"encoder\",\n", + " weights=weights,\n", + " embed_tokens=self.shared,\n", + " )0.0000000.000000
1 past_present_indices,\n", + " past_key_values: Optional[torch.Tensor] = None,\n", + " pre_allocate_past_size: Optional[int] = None,\n", + " lm_head_indices: Optional[torch.Tensor] = None,\n", + " ):\n", + " hidden_states, present = self.gpt_neox(\n", + " input_ids,\n", + " position_ids,\n", + " start_seq,\n", + " end_seq,\n", + " start_seq_q,\n", + " end_seq_q,\n", + " max_s,\n", + " past_present_indices,\n", + " past_key_values,\n", + " pre_allocate_past_size,\n", + " )\n", + " if lm_head_indices is not None:\n", + " hidden_states = hidden_states[lm_head_indices]\n", + " logits = /*\n", + " * Copyright (c) 2008-2021, Hazelcast, Inc. All Rights Reserved.\n", + " *\n", + " * Licensed under the Apache License, Version 2.0 (the \"License\");\n", + " * you may not use this file except in compliance with the License.\n", + " * You may obtain a copy of the License at\n", + " *\n", + " * http://www.apache.org/licenses/LICENSE-2.0\n", + " *\n", + " * Unless required by applicable law or agreed to in writing, software\n", + " * distributed under the License is distributed on an \"AS IS\" BASIS,self.embed_out(hidden_states)0.0000000.000000
2 if not isinstance(serialized_data, List):\n", + " serialized_data = [serialized_data]\n", + " if not isinstance(snapshot_data, List):\n", + " snapshot_data = [snapshot_data]\n", + "\n", + " return len(snapshot_data) == len(serialized_data) and all(\n", + " [eq_response(r, o) for r, o in zip(serialized_data, snapshot_data)]\n", + " )\n", + "\n", + "\n", + "class LauncherHandle:\n", + " def __init__(self, port: int):\n", + " self.client = AsyncClient(f\"http://localhost:{port}\")\n", + "\n", + " def _inner_health(self):\n", + " raise NotImplementedError\n", + "\n", + " async def health(self, timeout: int = 60):\n", + " assert timeout > 0\n", + " for _ in /*\n", + " * Copyright (c) 2008-2021, Hazelcast, Inc. All Rights Reserved.\n", + " *\n", + " * Licensed under the Apache License, Version 2.0 (the \"License\");\n", + " * you may not use this file except in compliance with the License.\n", + " * You may obtain a copy of the License at\n", + " *\n", + " * http://www.apache.org/licenses/LICENSE-2.0\n", + " *\n", + " * Unless required by applicable law or agreed to in writing, software\n", + " * distributed under the License is distributed on an \"AS IS\" BASIS,range(timeout)0.0000000.000000
@dataclass
class Item:
    """One evaluation sample: a tagged span of a document plus its context."""

    # Copied verbatim from the source document record.
    git_url: str
    filepath: str
    language: str

    # Text of the tag's "name_range" and "range" spans.
    name: str
    body: str
    # Context immediately before / after the tag's "range" span.
    prefix: str
    suffix: str


def iter_items(doc) -> Iterator[Item]:
    """Yield an Item for every tag of *doc*, skipping low-quality documents.

    A document is skipped entirely when its ``max_line_length`` exceeds 500,
    its ``avg_line_length`` is outside [10, 200], or its
    ``alphanum_fraction`` is below 0.25.
    """
    if doc["max_line_length"] > 500:
        return

    if doc["avg_line_length"] < 10 or doc["avg_line_length"] > 200:
        return

    if doc["alphanum_fraction"] < 0.25:
        return

    content = doc["content"]
    for tag in doc["tags"]:
        span = tag["range"]
        yield Item(
            name=get_content(content, tag["name_range"]),
            body=get_content(content, span),
            prefix=get_prefix(content, span["start"]),
            suffix=get_suffix(content, span["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )


def iter_docs(filepattern: str):
    """Yield one parsed JSON document per non-blank line of each matching file.

    Files are visited in sorted path order: ``glob.glob`` gives no ordering
    guarantee, and a stable order keeps downstream seeded sampling
    reproducible.
    """
    for filepath in sorted(glob.glob(filepattern)):
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                yield json.loads(line)


def get_content(content: str, range: dict):
    """Return the slice of *content* from ``range["start"]`` to ``range["end"]``."""
    return content[range["start"] : range["end"]]


def get_prefix(content: str, start: int, max=20):
    """Return the text preceding *start*, limited to *max* newlines.

    Scans backwards from ``start - 1``. The result begins right after the
    *max*-th newline found and ends at *start*. When fewer than *max*
    newlines precede *start*, everything from the beginning of *content* is
    returned. A non-positive *max* yields "".
    """
    if max <= 0:
        return ""

    num_lines = 0
    # Scan down to index 0 inclusive so a newline at the very start counts
    # and no leading characters are dropped.
    for idx in range(start - 1, -1, -1):
        if content[idx] == "\n":
            num_lines += 1
            if num_lines == max:
                return content[idx + 1 : start]

    return content[:start]


def get_suffix(content: str, end: int, max=20):
    """Return the text following *end*, limited to *max* newlines.

    Scans forwards from *end*. The result starts at *end* and stops just
    before the *max*-th newline found. When fewer than *max* newlines
    follow, everything up to the end of *content* is returned. A
    non-positive *max* yields "".
    """
    if max <= 0:
        return ""

    num_lines = 0
    for idx in range(end, len(content)):
        if content[idx] == "\n":
            num_lines += 1
            if num_lines == max:
                # Slice up to the newline itself; nothing before it is cut.
                return content[end:idx]

    return content[end:]


def items_from_filepattern(filepattern: str):
    """Yield every Item from all documents in files matching *filepattern*."""
    for doc in iter_docs(filepattern):
        yield from iter_items(doc)
a+x /usr/local/bin/docker-compose - # Pull tabby images. - git clone https://github.com/TabbyML/tabby.git || true - cd tabby/experimental + # Install tabby python client. + pip install -r requirements.txt # On certain cloud providers (e.g lambda cloud), the default user is not added to docker group, so we need sudo here sudo docker-compose pull @@ -20,5 +22,4 @@ setup: | run: | - cd tabby/experimental - sudo docker-compose up + ./eval.sh