From a56a0350ea2155b728b624a20f69b4cce06754f8 Mon Sep 17 00:00:00 2001
From: cheng
Date: Wed, 15 May 2024 10:40:53 -0400
Subject: [PATCH] Add Excel output to the WebArena custom tabulate

---
 .../autogenbench/autogenbench/tabulate_cmd.py |  22 ++-
 .../WebArena/Scripts/custom_tabulate.py       | 130 +++++++++++++++++-
 2 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/tabulate_cmd.py b/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
index e6587c8576d..a65e3e6d59a 100644
--- a/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
@@ -3,6 +3,7 @@
 import argparse
 import tabulate as tb
 from .load_module import load_module
+from copy import deepcopy
 
 # Figure out where everything is
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -104,6 +105,13 @@ def default_tabulate(args, scorer=default_scorer, exclude_dir_names=EXCLUDE_DIR_
         help="Output the results in CSV format.",
     )
 
+    parser.add_argument(
+        "-e",
+        "--excel",
+        help="Output the results to an Excel workbook at the given path.",
+        type=str,
+    )
+
     parsed_args = parser.parse_args(args)
 
     all_results = list()
@@ -161,15 +169,17 @@ def default_tabulate(args, scorer=default_scorer, exclude_dir_names=EXCLUDE_DIR_
     def _count_equals(value, trial):
         count = 0
         for row in all_results:
+            # Scorer results may be (status, extra) tuples; compare on the status
+            # element, and guard against rows with fewer trials than requested.
+            cell = row[trial + 1] if trial + 1 < len(row) else None
+            is_answer_matched = cell[0] if isinstance(cell, tuple) else cell
+
             # Count missing
             if value is None:
                 if trial + 1 < len(row):
-                    if row[trial + 1] is None:
+                    if is_answer_matched is None:
                         count += 1
                 else:
                     count += 1
             # Count match
-            elif trial + 1 < len(row) and row[trial + 1] == value:
+            elif trial + 1 < len(row) and is_answer_matched == value:
                 count += 1
         return count
@@ -194,7 +204,12 @@ def _count_equals(value, trial):
             footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
         footer.append(footer_row)
 
-    table = all_results.copy()
+    # Deep-copy before flattening tuple cells for display, so the all_results
+    # returned to the caller keeps the original values.
+    table = deepcopy(all_results)
+    for row in table:
+        for trial in range(max_instances):
+            if isinstance(row[trial + 1], tuple):
+                row[trial + 1] = row[trial + 1][0]
+
     table.append(tb.SEPARATING_LINE)
     table.extend(footer)
 
@@ -202,6 +217,7 @@ def _count_equals(value, trial):
 
     # Print out alpha-version warning
     sys.stderr.write("\n" + warning + "\n\n")
+    return parsed_args, all_results
 
 
 def tabulate_cli(args):
diff --git a/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py b/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
index daedc27864d..289dccd42df 100644
--- a/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
+++ b/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
@@ -1,14 +1,142 @@
 import os
 import sys
 from autogenbench.tabulate_cmd import default_tabulate, default_scorer
+import json
+import pandas as pd
+import sqlite3
+import glob
+
+EXCLUDE_DIR_NAMES = ["__pycache__"]
 
 
 def scorer(instance_dir, success_strings=["FINAL SCORE: 1"]):
     return default_scorer(instance_dir, success_strings=success_strings)
 
 
+def get_number_of_chat_messages(chat_messages_dir):
+    """Count chat messages across all *_messages.json files in a directory."""
+    result = 0
+    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
+        with open(file, "r") as f:
+            content = json.load(f)
+        for messages in content.values():
+            result += len(messages)
+    return result
+
+
 def main(args):
-    default_tabulate(args, scorer=scorer)
+    parsed_args, all_results = default_tabulate(args, scorer=scorer)
+    excel_path = parsed_args.excel
+
+    if excel_path:
+        excel_dir = os.path.dirname(excel_path) or "."
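+        # Make sure the workbook's parent directory exists before writing to it.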
+        os.makedirs(excel_dir, exist_ok=True)
+
+        if not excel_path.endswith((".xlsx", ".xls")):
+            excel_path += ".xlsx"
+
+        runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/"
+
+        if os.path.isdir(runlogs):
+            # Sort task directories by modification time so they line up with all_results.
+            task_ids = sorted(
+                [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES],
+                key=lambda s: os.path.getmtime(os.path.join(runlogs, s)),
+            )
+        else:
+            raise ValueError("runlogs must be a valid directory containing the results to tabulate")
+
+        trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else []
+        dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.sqlite" for task_id in task_ids] for trial in trials]
+
+        # Deduplicate rows that share an invocation_id, keeping the earliest by start_time.
+        query = """
+            SELECT cost, session_id, response, start_time, end_time
+            FROM (
+                SELECT invocation_id, cost, session_id, response, start_time, end_time,
+                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
+                FROM chat_completions
+            )
+            WHERE rn = 1;
+        """
+
+        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
+            for trial_index, each_trial in enumerate(dbnames):
+                result_df = pd.DataFrame(
+                    columns=[
+                        "id",
+                        "status",
+                        "cost",
+                        "latency",
+                        "num_of_llm_requests",
+                        "num_of_chat_messages",
+                        "prompt_tokens",
+                        "completion_tokens",
+                        "total_tokens",
+                        "model",
+                    ]
+                )
+
+                result_df_type_mapping = {
+                    "id": str,
+                    "status": str,
+                    "cost": float,
+                    "latency": float,
+                    "num_of_llm_requests": int,
+                    "num_of_chat_messages": int,
+                    "prompt_tokens": int,
+                    "completion_tokens": int,
+                    "total_tokens": int,
+                }
+
+                for dbname, scorer_results in zip(each_trial, all_results):
+                    task_id = scorer_results[0]
+                    scorer_result = scorer_results[trial_index + 1]
+
+                    # Scorer results may be (status, extra) tuples; keep only the status.
+                    status = scorer_result[0] if isinstance(scorer_result, tuple) else scorer_result
+
+                    con = sqlite3.connect(dbname)
+                    # TODO: if large amount of data, add chunksize
+                    telemetry_df = pd.read_sql_query(query, con)
+                    con.close()
+
+                    earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min()
+                    latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max()
+
+                    num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname))
+
+                    # Parse each response JSON once, then read token usage defensively.
+                    responses = telemetry_df["response"].apply(json.loads)
+                    usages = responses.apply(lambda r: r.get("usage") or {})
+
+                    result = {
+                        "id": task_id,
+                        "status": status,
+                        "cost": telemetry_df["cost"].sum(),
+                        "latency": (latest_endtime - earliest_starttime).total_seconds(),
+                        "num_of_llm_requests": len(telemetry_df),
+                        "num_of_chat_messages": num_of_chat_messages,
+                        "prompt_tokens": usages.apply(lambda u: u.get("prompt_tokens", 0)).sum(),
+                        "completion_tokens": usages.apply(lambda u: u.get("completion_tokens", 0)).sum(),
+                        "total_tokens": usages.apply(lambda u: u.get("total_tokens", 0)).sum(),
+                        "model": ", ".join(responses.apply(lambda r: r.get("model", "")).unique()),
+                    }
+
+                    result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
+
+                # Cast once after all rows are appended, so the last row is typed as well.
+                result_df = result_df.astype(result_df_type_mapping)
+                result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False)
 
 
 if __name__ == "__main__" and __package__ is None:
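Reviewer note: the window-function query above keeps only the earliest chat_completions row per invocation_id, so each LLM call is counted once even if it was logged more than once. Below is a minimal, self-contained sketch of that behavior against a toy in-memory database; it assumes only the columns the query references (the real telemetry.sqlite schema may carry more) and a SQLite build >= 3.25 for ROW_NUMBER().

import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
con.execute(
    """CREATE TABLE chat_completions (
        invocation_id TEXT, cost REAL, session_id TEXT,
        response TEXT, start_time TEXT, end_time TEXT
    )"""
)
# Two rows share invocation_id "a", simulating a call logged twice.
con.executemany(
    "INSERT INTO chat_completions VALUES (?, ?, ?, ?, ?, ?)",
    [
        ("a", 0.01, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:00.000", "2024-05-15 10:00:01.000"),
        ("a", 0.01, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:02.000", "2024-05-15 10:00:03.000"),
        ("b", 0.02, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:04.000", "2024-05-15 10:00:05.000"),
    ],
)

query = """
    SELECT cost, session_id, response, start_time, end_time
    FROM (
        SELECT invocation_id, cost, session_id, response, start_time, end_time,
            ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
        FROM chat_completions
    )
    WHERE rn = 1;
"""

df = pd.read_sql_query(query, con)
con.close()
print(len(df))  # 2 -- only the earliest row for "a" survives, plus the row for "b"

With the patch applied, an invocation along the lines of `autogenbench tabulate Results/<run_id> --excel ./out/results.xlsx` (paths hypothetical) should then write one `trial_<n>` sheet per trial into the workbook.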