changed the formatting of monitor.py #3592

Status: Open · wants to merge 6 commits into main

Changes from all commits
166 changes: 128 additions & 38 deletions fastchat/serve/monitor/monitor.py
@@ -249,7 +249,41 @@ def get_full_table(arena_df, model_table_df, model_to_score):


 def arena_hard_process(leaderboard_table_file, filepath):
-    arena_hard = pd.read_csv(filepath)
+    with open(filepath, "rb") as f:  # 'rb' reads the pickle in binary mode
+        combined_arena_hard = pickle.load(f)  # normal + style-controlled results
+
+    arena_hard_normal = pd.DataFrame(combined_arena_hard["normal"])
+    arena_hard_style = pd.DataFrame(combined_arena_hard["style_control"])
+
+    normal_rankings = recompute_final_ranking(arena_hard_normal)
+    style_rankings = recompute_final_ranking(arena_hard_style)
+
+    arena_hard_normal.insert(
+        loc=0, column="Rank* (UB)", value=normal_rankings
+    )
+    arena_hard_style.insert(
+        loc=0, column="Rank (StyleCtrl)", value=style_rankings
+    )
+
+    arena_hard_normal["avg_tokens"] = arena_hard_normal["avg_tokens"].astype(int)
+
+    # combine the normal and style-controlled tables
+    combined_df = pd.merge(
+        arena_hard_normal,
+        arena_hard_style[["model", "Rank (StyleCtrl)"]],
+        on="model",
+        how="left",
+    )
+
+    # move 'Rank (StyleCtrl)' to position 1 (second column)
+    columns = list(combined_df.columns)
+    columns.insert(1, columns.pop(columns.index("Rank (StyleCtrl)")))
+    combined_df = combined_df[columns]
+
+    combined_df["Rank (StyleCtrl)"] = combined_df["Rank (StyleCtrl)"].astype(
+        int
+    )  # convert the rankings to integer values
+
     leaderboard_table = pd.read_csv(leaderboard_table_file)
     links = leaderboard_table.get("Link")
     display_name = leaderboard_table.get("Model")

@@ -265,23 +299,21 @@ def arena_hard_process(leaderboard_table_file, filepath):
         info[model_name[i]] = model_info

     organization = []
-    for i in range(len(arena_hard)):
+    for i in range(len(combined_df)):
         assert (
-            arena_hard.loc[i, "model"] in info
-        ), f"need to update leaderboard_table info by adding {arena_hard.loc[i, 'model']}"
-        organization.append(info[arena_hard.loc[i, "model"]]["org"])
-        link = info[arena_hard.loc[i, "model"]]["link"]
-        arena_hard.loc[i, "model"] = model_hyperlink(
-            info[arena_hard.loc[i, "model"]]["display"], link
+            combined_df.loc[i, "model"] in info
+        ), f"need to update leaderboard_table info by adding {combined_df.loc[i, 'model']}"
+        organization.append(info[combined_df.loc[i, "model"]]["org"])
+        link = info[combined_df.loc[i, "model"]]["link"]
+        combined_df.loc[i, "model"] = model_hyperlink(
+            info[combined_df.loc[i, "model"]]["display"], link
         )

-    arena_hard.insert(
-        loc=len(arena_hard.columns), column="Organization", value=organization
+    combined_df.insert(
+        loc=len(combined_df.columns), column="Organization", value=organization
     )

-    rankings = recompute_final_ranking(arena_hard)
-    arena_hard.insert(loc=0, column="Rank* (UB)", value=rankings)
-    return arena_hard
+    return combined_df
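
For reference, a minimal sketch of the pickle payload this function now expects. The top-level keys "normal" and "style_control" are taken from the diff; the per-row fields are assumptions inferred from the columns the function touches, not from the actual exporter:

import pickle

# Hypothetical payload: only the "normal"/"style_control" keys are confirmed
# by the diff; the row fields below are illustrative guesses.
combined_arena_hard = {
    "normal": [
        {"model": "model-a", "Win-rate": 0.61, "CI": "(-1.2, 1.4)", "avg_tokens": 540.0},
        {"model": "model-b", "Win-rate": 0.55, "CI": "(-1.0, 1.1)", "avg_tokens": 610.0},
    ],
    "style_control": [
        {"model": "model-a", "Win-rate": 0.58},
        {"model": "model-b", "Win-rate": 0.57},
    ],
}

with open("arena_hard.pkl", "wb") as f:
    pickle.dump(combined_arena_hard, f)  # what arena_hard_process later reads via pickle.load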


def get_arena_table(
@@ -398,6 +430,28 @@ def highlight_rank_max(s):
     return elo_dataframe.style.apply(highlight_rank_max, subset=["Delta"])


+# highlight the style control rank value as red
+def highlight_red(s):
+    return [("color: red; font-weight: bold") for v in s]
+
+
+# highlight the style control rank value as green
+def highlight_green(s):
+    return [("color: green; font-weight: bold") for v in s]
+
+
+# decide whether to highlight the style control rank value as green or red
+def compare_func(row):
+    if row["Rank (StyleCtrl)"] is None:
+        return 0
+    if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
+        return 0
+    elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]:
+        return 1
+    else:
+        return -1
+
+
 def update_overall_leaderboard_df(arena_table_vals):
     columns = [
         "Rank* (UB)",

@@ -412,30 +466,38 @@ def update_overall_leaderboard_df(arena_table_vals):
     ]
     elo_dataframe = pd.DataFrame(arena_table_vals, columns=columns)

-    def highlight_red(s):
-        return [("color: red; font-weight: bold") for v in s]
-
-    def highlight_green(s):
-        return [("color: green; font-weight: bold") for v in s]
-
-    def compare_func(row):
-        if row["Rank (StyleCtrl)"] is None:
-            return 0
-        if row["Rank (StyleCtrl)"] == row["Rank* (UB)"]:
-            return 0
-        elif row["Rank (StyleCtrl)"] < row["Rank* (UB)"]:
-            return 1
-        else:
-            return -1
-
     comparison = elo_dataframe.apply(
         compare_func,
         axis=1,
     )
     indices_red = [i for i, value in enumerate(comparison) if value == -1]
     indices_green = [i for i, value in enumerate(comparison) if value == 1]

     return elo_dataframe.style.apply(
         highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]]
     ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]])
+
+
+def update_hard_leaderboard_df(arena_table_vals):
+    columns = [
+        "Rank* (UB)",
+        "Rank (StyleCtrl)",
+        "Model",
+        "Win-rate",
+        "95% CI",
+        "Average Tokens",
+        "Organization",
+    ]
+    dataframe = pd.DataFrame(arena_table_vals, columns=columns)
+
+    comparison = dataframe.apply(
+        compare_func,
+        axis=1,
+    )
+    indices_red = [i for i, value in enumerate(comparison) if value == -1]
+    indices_green = [i for i, value in enumerate(comparison) if value == 1]
+
+    return dataframe.style.apply(
+        highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]]
+    ).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]])
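
Hoisting highlight_red, highlight_green, and compare_func to module level lets both leaderboard builders share one implementation instead of redefining nested copies. As a standalone illustration of the highlighting pattern (toy data, not part of the diff), pandas' Styler.apply with a pd.IndexSlice subset restricts each color to selected rows of the "Rank (StyleCtrl)" column:

import pandas as pd

def highlight_red(s):
    return ["color: red; font-weight: bold" for _ in s]

def highlight_green(s):
    return ["color: green; font-weight: bold" for _ in s]

# Toy frame: row 0 improved under style control, row 2 regressed.
df = pd.DataFrame(
    {"Rank* (UB)": [3, 1, 2], "Rank (StyleCtrl)": [1, 1, 4], "Model": ["a", "b", "c"]}
)
delta = df["Rank (StyleCtrl)"] - df["Rank* (UB)"]
indices_green = list(df.index[delta < 0])  # better (lower) rank with style control
indices_red = list(df.index[delta > 0])    # worse (higher) rank with style control

styled = df.style.apply(
    highlight_red, subset=pd.IndexSlice[indices_red, ["Rank (StyleCtrl)"]]
).apply(highlight_green, subset=pd.IndexSlice[indices_green, ["Rank (StyleCtrl)"]])
html = styled.to_html()  # colored cells render in e.g. a notebook or Gradio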

@@ -493,9 +555,9 @@ def update_leaderboard_and_plots(category, filters):
     arena_values = get_arena_table(
         arena_df,
         model_table_df,
-        arena_subset_df=arena_subset_df
-        if category != "Overall"
-        else arena_overall_sc_df,
+        arena_subset_df=(
+            arena_subset_df if category != "Overall" else arena_overall_sc_df
+        ),
         hidden_models=(
             None
             if len(filters) > 0 and "Show Deprecated" in filters

@@ -1013,12 +1075,14 @@ def build_leaderboard_tab(
                     dataFrame = arena_hard_process(
                         leaderboard_table_file, arena_hard_leaderboard
                     )
+
                     date = dataFrame["date"][0]
                     dataFrame = dataFrame.drop(
                         columns=["rating_q025", "rating_q975", "date"]
                     )
                     dataFrame["CI"] = dataFrame.CI.map(ast.literal_eval)
-                    dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/-{x[0]}")
+                    dataFrame["CI"] = dataFrame.CI.map(lambda x: f"+{x[1]}/{x[0]}")
Member: why do we remove the "-" here?

Author: there was an extra "-" displayed, so the CI rendered as something like "--3.2".

Member: oh got it, thanks!
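
In other words, with toy values: the lower bound in the parsed CI tuple is already negative, so the hard-coded "-" in the old format string doubled the sign.

ci = (-3.2, 5.1)  # (lower, upper) after ast.literal_eval; lower is already negative

old = f"+{ci[1]}/-{ci[0]}"  # "+5.1/--3.2"  <- double negative
new = f"+{ci[1]}/{ci[0]}"   # "+5.1/-3.2"
print(old, new)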


                     dataFrame = dataFrame.rename(
                         columns={
                             "model": "Model",

@@ -1027,23 +1091,49 @@ def build_leaderboard_tab(
                             "avg_tokens": "Average Tokens",
                         }
                     )
+                    dataFrame["Win-rate"] = dataFrame["Win-rate"].apply(
+                        lambda x: f"{x:g}" if pd.notnull(x) else x
+                    )

                     model_to_score = {}
                     for i in range(len(dataFrame)):
-                        model_to_score[dataFrame.loc[i, "Model"]] = dataFrame.loc[
-                            i, "Win-rate"
-                        ]
+                        model_name = dataFrame.loc[i, "Model"]
+                        win_rate = dataFrame.loc[i, "Win-rate"]
+
+                        model_to_score[model_name] = win_rate

                     md = arena_hard_title(date)
                     gr.Markdown(md, elem_id="leaderboard_markdown")

+                    dataFrame = update_hard_leaderboard_df(dataFrame)
                     gr.DataFrame(
                         dataFrame,
                         datatype=[
-                            "markdown" if col == "Model" else "str"
-                            for col in dataFrame.columns
+                            "number",
+                            "number",
+                            "markdown",
+                            "str",
+                            "str",
+                            "number",
+                            "str",
                         ],
                         elem_id="arena_hard_leaderboard",
                         height=1000,
                         wrap=True,
-                        column_widths=[70, 190, 80, 80, 90, 150],
+                        column_widths=[70, 70, 190, 80, 80, 90, 150],
                     )
+                    gr.Markdown(
+                        f"""
+***Rank (UB)**: model's ranking (upper-bound), defined as one plus the number of models that are statistically better than the target model.
+Model A is statistically better than model B when A's lower-bound score is greater than B's upper-bound score (at a 95% confidence level).
+See Figure 1 below for a visualization of the confidence intervals of model scores.
+
+**Rank (StyleCtrl)**: model's ranking with style control, which accounts for factors like response length and markdown usage to decouple model performance from these potentially confounding variables.
+See the [blog post](https://blog.lmarena.ai/blog/2024/style-control/) for further details.
+
+Note: in each category, we exclude models with fewer than 300 votes, as their confidence intervals can be large.
+""",
+                        elem_id="leaderboard_markdown",
+                    )
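
For context, the Rank* (UB) rule quoted in the markdown above fits in a few lines. recompute_final_ranking itself is not part of this diff, so this is only a sketch of the described behavior; the rating_q025/rating_q975 column names are borrowed from the drop() call earlier in the diff:

import pandas as pd

def recompute_final_ranking_sketch(df: pd.DataFrame) -> list:
    """Upper-bound rank: 1 + the number of models statistically better than this one.

    Model A is statistically better than model B when A's lower bound
    (rating_q025) exceeds B's upper bound (rating_q975).
    """
    ranking = []
    for _, row in df.iterrows():
        better = int((df["rating_q025"] > row["rating_q975"]).sum())
        ranking.append(better + 1)
    return ranking

# Toy example: "a" sits statistically above the other two, whose intervals overlap.
toy = pd.DataFrame(
    {
        "model": ["a", "b", "c"],
        "rating_q025": [1210.0, 1150.0, 1140.0],
        "rating_q975": [1230.0, 1180.0, 1175.0],
    }
)
print(recompute_final_ranking_sketch(toy))  # [1, 2, 2]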

with gr.Tab("Full Leaderboard", id=4):