From a56a0350ea2155b728b624a20f69b4cce06754f8 Mon Sep 17 00:00:00 2001
From: cheng
Date: Wed, 15 May 2024 10:40:53 -0400
Subject: [PATCH] Add Excel output to the WebArena custom tabulate

---
 .../autogenbench/autogenbench/tabulate_cmd.py |  22 ++-
 .../WebArena/Scripts/custom_tabulate.py       | 130 +++++++++++++++++-
 2 files changed, 148 insertions(+), 4 deletions(-)

diff --git a/samples/tools/autogenbench/autogenbench/tabulate_cmd.py b/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
index e6587c8576d..a65e3e6d59a 100644
--- a/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
+++ b/samples/tools/autogenbench/autogenbench/tabulate_cmd.py
@@ -3,6 +3,7 @@
 import argparse
 import tabulate as tb
 from .load_module import load_module
+from copy import deepcopy
 
 # Figure out where everything is
 SCRIPT_PATH = os.path.realpath(__file__)
@@ -104,6 +105,13 @@ def default_tabulate(args, scorer=default_scorer, exclude_dir_names=EXCLUDE_DIR_
         help="Output the results in CSV format.",
     )
 
+    parser.add_argument(
+        "-e",
+        "--excel",
+        help="Output the results to an Excel workbook at the given path.",
+        type=str,
+    )
+
     parsed_args = parser.parse_args(args)
 
     all_results = list()
@@ -161,15 +169,17 @@ def default_tabulate(args, scorer=default_scorer, exclude_dir_names=EXCLUDE_DIR_
     def _count_equals(value, trial):
         count = 0
         for row in all_results:
+            # Scorer results may be (status, extra) tuples; compare on the status
+            # element, and guard against rows with fewer trials than requested.
+            cell = row[trial + 1] if trial + 1 < len(row) else None
+            is_answer_matched = cell[0] if isinstance(cell, tuple) else cell
+
             # Count missing
             if value is None:
                 if trial + 1 < len(row):
-                    if row[trial + 1] is None:
+                    if is_answer_matched is None:
                         count += 1
                 else:
                     count += 1
             # Count match
-            elif trial + 1 < len(row) and row[trial + 1] == value:
+            elif trial + 1 < len(row) and is_answer_matched == value:
                 count += 1
         return count
@@ -194,7 +204,12 @@ def _count_equals(value, trial):
             footer_row.append(footer[0][i + 1] + footer[1][i + 1] + footer[2][i + 1])
         footer.append(footer_row)
 
-    table = all_results.copy()
+    # Deep-copy before flattening tuple cells for display, so the all_results
+    # returned to the caller keeps the original values.
+    table = deepcopy(all_results)
+    for row in table:
+        for trial in range(max_instances):
+            if isinstance(row[trial + 1], tuple):
+                row[trial + 1] = row[trial + 1][0]
+
     table.append(tb.SEPARATING_LINE)
     table.extend(footer)
 
@@ -202,6 +217,7 @@ def _count_equals(value, trial):
 
     # Print out alpha-version warning
     sys.stderr.write("\n" + warning + "\n\n")
+    return parsed_args, all_results
 
 
 def tabulate_cli(args):
diff --git a/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py b/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
index daedc27864d..289dccd42df 100644
--- a/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
+++ b/samples/tools/autogenbench/scenarios/WebArena/Scripts/custom_tabulate.py
@@ -1,14 +1,142 @@
 import os
 import sys
 from autogenbench.tabulate_cmd import default_tabulate, default_scorer
+import json
+import pandas as pd
+import sqlite3
+import glob
+
+EXCLUDE_DIR_NAMES = ["__pycache__"]
 
 
 def scorer(instance_dir, success_strings=["FINAL SCORE: 1"]):
     return default_scorer(instance_dir, success_strings=success_strings)
 
 
+def get_number_of_chat_messages(chat_messages_dir):
+    """Count chat messages across all *_messages.json files in a directory."""
+    result = 0
+    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
+        with open(file, "r") as f:
+            content = json.load(f)
+        for messages in content.values():
+            result += len(messages)
+    return result
+
+
 def main(args):
-    default_tabulate(args, scorer=scorer)
+    parsed_args, all_results = default_tabulate(args, scorer=scorer)
+    excel_path = parsed_args.excel
+
+    if excel_path:
+        excel_dir = os.path.dirname(excel_path) or "."
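+        # Make sure the workbook's parent directory exists before writing to it.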
+        os.makedirs(excel_dir, exist_ok=True)
+
+        if not excel_path.endswith((".xlsx", ".xls")):
+            excel_path += ".xlsx"
+
+        runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/"
+
+        if os.path.isdir(runlogs):
+            # Sort task directories by modification time so they line up with all_results.
+            task_ids = sorted(
+                [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES],
+                key=lambda s: os.path.getmtime(os.path.join(runlogs, s)),
+            )
+        else:
+            raise ValueError("runlogs must be a valid directory containing the results to tabulate")
+
+        trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else []
+        dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.sqlite" for task_id in task_ids] for trial in trials]
+
+        # Deduplicate rows that share an invocation_id, keeping the earliest by start_time.
+        query = """
+            SELECT cost, session_id, response, start_time, end_time
+            FROM (
+                SELECT invocation_id, cost, session_id, response, start_time, end_time,
+                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
+                FROM chat_completions
+            )
+            WHERE rn = 1;
+        """
+
+        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
+            for trial_index, each_trial in enumerate(dbnames):
+                result_df = pd.DataFrame(
+                    columns=[
+                        "id",
+                        "status",
+                        "cost",
+                        "latency",
+                        "num_of_llm_requests",
+                        "num_of_chat_messages",
+                        "prompt_tokens",
+                        "completion_tokens",
+                        "total_tokens",
+                        "model",
+                    ]
+                )
+
+                result_df_type_mapping = {
+                    "id": str,
+                    "status": str,
+                    "cost": float,
+                    "latency": float,
+                    "num_of_llm_requests": int,
+                    "num_of_chat_messages": int,
+                    "prompt_tokens": int,
+                    "completion_tokens": int,
+                    "total_tokens": int,
+                }
+
+                for dbname, scorer_results in zip(each_trial, all_results):
+                    task_id = scorer_results[0]
+                    scorer_result = scorer_results[trial_index + 1]
+
+                    # Scorer results may be (status, extra) tuples; keep only the status.
+                    status = scorer_result[0] if isinstance(scorer_result, tuple) else scorer_result
+
+                    con = sqlite3.connect(dbname)
+                    # TODO: if large amount of data, add chunksize
+                    telemetry_df = pd.read_sql_query(query, con)
+                    con.close()
+
+                    earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min()
+                    latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max()
+
+                    num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname))
+
+                    # Parse each response JSON once, then read token usage defensively.
+                    responses = telemetry_df["response"].apply(json.loads)
+                    usages = responses.apply(lambda r: r.get("usage") or {})
+
+                    result = {
+                        "id": task_id,
+                        "status": status,
+                        "cost": telemetry_df["cost"].sum(),
+                        "latency": (latest_endtime - earliest_starttime).total_seconds(),
+                        "num_of_llm_requests": len(telemetry_df),
+                        "num_of_chat_messages": num_of_chat_messages,
+                        "prompt_tokens": usages.apply(lambda u: u.get("prompt_tokens", 0)).sum(),
+                        "completion_tokens": usages.apply(lambda u: u.get("completion_tokens", 0)).sum(),
+                        "total_tokens": usages.apply(lambda u: u.get("total_tokens", 0)).sum(),
+                        "model": ", ".join(responses.apply(lambda r: r.get("model", "")).unique()),
+                    }
+
+                    result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
+
+                # Cast once after all rows are appended, so the last row is typed as well.
+                result_df = result_df.astype(result_df_type_mapping)
+                result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False)
 
 
 if __name__ == "__main__" and __package__ is None:
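Reviewer note: the window-function query above keeps only the earliest chat_completions row per invocation_id, so each LLM call is counted once even if it was logged more than once. Below is a minimal, self-contained sketch of that behavior against a toy in-memory database; it assumes only the columns the query references (the real telemetry.sqlite schema may carry more) and a SQLite build >= 3.25 for ROW_NUMBER().

import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
con.execute(
    """CREATE TABLE chat_completions (
        invocation_id TEXT, cost REAL, session_id TEXT,
        response TEXT, start_time TEXT, end_time TEXT
    )"""
)
# Two rows share invocation_id "a", simulating a call logged twice.
con.executemany(
    "INSERT INTO chat_completions VALUES (?, ?, ?, ?, ?, ?)",
    [
        ("a", 0.01, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:00.000", "2024-05-15 10:00:01.000"),
        ("a", 0.01, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:02.000", "2024-05-15 10:00:03.000"),
        ("b", 0.02, "s1", '{"model": "gpt-4"}', "2024-05-15 10:00:04.000", "2024-05-15 10:00:05.000"),
    ],
)

query = """
    SELECT cost, session_id, response, start_time, end_time
    FROM (
        SELECT invocation_id, cost, session_id, response, start_time, end_time,
            ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
        FROM chat_completions
    )
    WHERE rn = 1;
"""

df = pd.read_sql_query(query, con)
con.close()
print(len(df))  # 2 -- only the earliest row for "a" survives, plus the row for "b"

With the patch applied, an invocation along the lines of `autogenbench tabulate Results/<run_id> --excel ./out/results.xlsx` (paths hypothetical) should then write one `trial_<n>` sheet per trial into the workbook.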