From 9ed7130f57ac2a86b8564e390ae5cf338cbdddd7 Mon Sep 17 00:00:00 2001 From: Open LLM Leaderboard PR Bot Date: Mon, 8 Apr 2024 07:49:51 +0000 Subject: [PATCH] Adding Evaluation Results This is an automated PR created with https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card. If you encounter any issues, please report them to https://huggingface.co/spaces/Weyaxi/open-llm-leaderboard-results-pr/discussions --- README.md | 135 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 9737c9c..4e38543 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,123 @@ --- +language: +- en license: apache-2.0 datasets: - cerebras/SlimPajama-627B - bigcode/starcoderdata - HuggingFaceH4/ultrachat_200k - HuggingFaceH4/ultrafeedback_binarized -language: -- en widget: - - example_title: Fibonacci (Python) - messages: - - role: system - content: You are a chatbot who can help code! - - role: user - content: Write me a function to calculate the first 10 digits of the fibonacci sequence in Python and print it out to the CLI. +- example_title: Fibonacci (Python) + messages: + - role: system + content: You are a chatbot who can help code! + - role: user + content: Write me a function to calculate the first 10 digits of the fibonacci + sequence in Python and print it out to the CLI. +model-index: +- name: TinyLlama-1.1B-Chat-v1.0 + results: + - task: + type: text-generation + name: Text Generation + dataset: + name: AI2 Reasoning Challenge (25-Shot) + type: ai2_arc + config: ARC-Challenge + split: test + args: + num_few_shot: 25 + metrics: + - type: acc_norm + value: 36.09 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: HellaSwag (10-Shot) + type: hellaswag + split: validation + args: + num_few_shot: 10 + metrics: + - type: acc_norm + value: 61.1 + name: normalized accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: MMLU (5-Shot) + type: cais/mmlu + config: all + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 25.39 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: TruthfulQA (0-shot) + type: truthful_qa + config: multiple_choice + split: validation + args: + num_few_shot: 0 + metrics: + - type: mc2 + value: 37.48 + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: Winogrande (5-shot) + type: winogrande + config: winogrande_xl + split: validation + args: + num_few_shot: 5 + metrics: + - type: acc + value: 61.25 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard + - task: + type: text-generation + name: Text Generation + dataset: + name: GSM8k (5-shot) + type: gsm8k + config: main + split: test + args: + num_few_shot: 5 + metrics: + - type: acc + value: 2.35 + name: accuracy + source: + url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=TinyLlama/TinyLlama-1.1B-Chat-v1.0 + name: Open LLM Leaderboard ---
@@ -63,4 +167,17 @@ print(outputs[0]["generated_text"]) # How many helicopters can a human eat in one sitting? # <|assistant|> # ... -``` \ No newline at end of file +``` +# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) +Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_TinyLlama__TinyLlama-1.1B-Chat-v1.0) + +| Metric |Value| +|---------------------------------|----:| +|Avg. |37.28| +|AI2 Reasoning Challenge (25-Shot)|36.09| +|HellaSwag (10-Shot) |61.10| +|MMLU (5-Shot) |25.39| +|TruthfulQA (0-shot) |37.48| +|Winogrande (5-shot) |61.25| +|GSM8k (5-shot) | 2.35| +