From 6ca85a2bdc5742dfb9e239618022b0a4970141eb Mon Sep 17 00:00:00 2001 From: Mateusz Krainski Date: Tue, 30 Jul 2024 21:40:35 -0400 Subject: [PATCH 1/2] Make progress on the visualisation --- yafs/notebooks/analysis.ipynb | 185 ++++++++++++++++++++-------------- 1 file changed, 108 insertions(+), 77 deletions(-) diff --git a/yafs/notebooks/analysis.ipynb b/yafs/notebooks/analysis.ipynb index 6daf699..5e37f8c 100644 --- a/yafs/notebooks/analysis.ipynb +++ b/yafs/notebooks/analysis.ipynb @@ -28,16 +28,6 @@ "import numpy as np" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "fe086eca-23ad-4543-8bd4-5405bf06b560", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"this is a test\")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -67,12 +57,40 @@ "df[\"departure_date_dt\"] = pd.to_datetime(\n", " df[\"departure_date\"].apply(lambda x: f\"{x} 2024\")\n", ")\n", - "df[\"return_date_dt\"] = pd.to_datetime(df[\"return_date\"].apply(lambda x: f\"{x} 2024\"))\n", - "\n", + "df[\"return_date_dt\"] = pd.to_datetime(df[\"return_date\"].apply(lambda x: f\"{x} 2024\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a13843b", + "metadata": {}, + "outputs": [], + "source": [ "df[\"departure_dt\"] = pd.to_datetime(\n", - " df.apply(lambda row: f\"{row['departure']} {row['departure_date']} 2024\", axis=1)\n", - ")\n", - "df[\"flight_duration_dt\"] = pd.to_timedelta(df[\"duration\"])\n", + " df.apply(lambda row: f\"{row['departure']} {row['departure_date']} 2024\", axis=1),\n", + " format=\"%I:%M %p %d %b %Y\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39628bfa", + "metadata": {}, + "outputs": [], + "source": [ + "def parse_duration(duration):\n", + " pattern = r\"(?:(\\d+) hr)? ?(?:(\\d+) min)?\"\n", + " match = re.match(pattern, duration.strip())\n", + " if match:\n", + " hours = int(match.group(1)) if match.group(1) else 0\n", + " minutes = int(match.group(2)) if match.group(2) else 0\n", + " return timedelta(hours=hours, minutes=minutes)\n", + " return timedelta(0)\n", + "\n", + "\n", + "df[\"flight_duration_dt\"] = df[\"duration\"].apply(parse_duration)\n", "df[\"landing_origin_dt\"] = df[\"departure_dt\"] + df[\"flight_duration_dt\"]" ] }, @@ -110,6 +128,19 @@ "df[\"stops_str\"] = df[\"stops\"].apply(lambda x: x if isinstance(x, str) else \"No\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a9a4595", + "metadata": {}, + "outputs": [], + "source": [ + "# In current processing, if the price is like 'CA$ 1,200', the algorithm will split this\n", + "# into digits and characters, so we'll get 'CA$,' and '1200'.\n", + "# This removes the unnecessary comma.\n", + "df[\"price_currency\"] = df[\"price_currency\"].str.replace(\",\", \"\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -319,28 +350,52 @@ { "cell_type": "code", "execution_count": null, - "id": "7b994d47-cb96-4518-a213-c13fcf045b75", + "id": "a98fd73e", "metadata": {}, "outputs": [], "source": [ - "px.scatter(\n", - " df_filtered,\n", - " y=\"price_unit\",\n", - " x=\"flight_duration_hours\",\n", - " color=\"direct_flight\",\n", - " marginal_y=\"violin\",\n", - " marginal_x=\"box\",\n", - ")" + "custom_data_columns = [\n", + " \"connection\",\n", + " \"departure_dt\",\n", + " \"landing\",\n", + " \"duration\",\n", + " \"stops_str\",\n", + " \"price_currency\",\n", + " \"price_unit\",\n", + "]\n", + "\n", + "# is used to omit the trace name.\n", + "hovertemplate = \"\"\"\n", + "%{customdata[0]}
\n", + "Departure: %{customdata[1]}
\n", + "Landing: %{customdata[2]}
\n", + "Duration: %{customdata[3]}
\n", + "Stops: %{customdata[4]}
\n", + "Price: %{customdata[5]} %{customdata[6]}\n", + "\n", + "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "ac201f84-90b7-432a-ad98-1d41a07acc3f", + "id": "7b994d47-cb96-4518-a213-c13fcf045b75", "metadata": {}, "outputs": [], "source": [ - "df_filtered.iloc[1]" + "fig = px.scatter(\n", + " df_filtered,\n", + " y=\"price_unit\",\n", + " x=\"flight_duration_hours\",\n", + " color=\"direct_flight\",\n", + " marginal_y=\"violin\",\n", + " marginal_x=\"box\",\n", + " custom_data=custom_data_columns,\n", + ")\n", + "\n", + "fig.update_traces(hovertemplate=hovertemplate)\n", + "\n", + "fig.show()" ] }, { @@ -356,25 +411,9 @@ " y=\"price_unit\",\n", " color=\"direct_flight\",\n", " opacity=0.5,\n", - " custom_data=[\n", - " \"connection\",\n", - " \"departure_dt\",\n", - " \"landing\",\n", - " \"duration\",\n", - " \"stops_str\",\n", - " \"price_currency\",\n", - " \"price_unit\",\n", - " ],\n", + " custom_data=custom_data_columns,\n", ")\n", "\n", - "hovertemplate = \"\"\"\n", - "%{customdata[0]}
\n", - "Departure: %{customdata[1]}
\n", - "Landing: %{customdata[2]}
\n", - "Duration: %{customdata[3]}
\n", - "Stops: %{customdata[4]}
\n", - "Price: %{customdata[5]} %{customdata[6]}\n", - "\"\"\"\n", "\n", "# TODO: add some visualisation for how good the price/duration is relative to others\n", "# e.g. how many % flights are below this duration/prict\n", @@ -385,24 +424,6 @@ "fig.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "86cb46d0-4f6d-41cd-ac10-ba6b20a3e6b5", - "metadata": {}, - "outputs": [], - "source": [ - "df_filtered[df_filtered[\"price_unit\"] < 850][\n", - " [\n", - " \"origin_airport_name\",\n", - " \"destination_airport_name\",\n", - " \"departure_date\",\n", - " \"return_date\",\n", - " \"airline\",\n", - " ]\n", - "]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -410,25 +431,35 @@ "metadata": {}, "outputs": [], "source": [ - "df_sub = df_filtered # [df_filtered[\"departure_date_dt\"] == datetime(2024, 11, 20)]\n", - "line_colors = {True: \"#F44336\", False: \"#2196F3\"}\n", - "\n", - "fig = go.Figure()\n", - "for _, row in df_sub.iterrows():\n", - " fig.add_trace(\n", - " go.Scatter(\n", - " x=[row[\"departure_dt\"], row[\"landing_origin_dt\"]],\n", - " y=[row[\"price_unit\"], row[\"price_unit\"]],\n", - " line_color=line_colors[row[\"direct_flight\"]],\n", - " )\n", + "AXIS_OFFSET = 50\n", + "\n", + "price_global_min = df_filtered[\"price_unit\"].min() - AXIS_OFFSET\n", + "price_global_max = df_filtered[\"price_unit\"].max() + AXIS_OFFSET\n", + "\n", + "for departure_date in df_filtered[\"departure_date_dt\"].unique():\n", + " df_sub = df_filtered[df_filtered[\"departure_date_dt\"] == departure_date]\n", + " line_colors = {True: \"#F44336\", False: \"#2196F3\"}\n", + "\n", + " fig = go.Figure(\n", + " layout_yaxis_range=[price_global_min, price_global_max],\n", " )\n", + " for _, row in df_sub.iterrows():\n", + " fig.add_trace(\n", + " go.Scatter(\n", + " x=[row[\"departure_dt\"], row[\"landing_origin_dt\"]],\n", + " y=[row[\"price_unit\"], row[\"price_unit\"]],\n", + " customdata=[[row[col] for col in custom_data_columns]],\n", + " line_color=line_colors[row[\"direct_flight\"]],\n", + " hovertemplate=hovertemplate,\n", + " ),\n", + " )\n", "\n", - "fig.update_layout(\n", - " autosize=True,\n", - " height=1000,\n", - ")\n", + " fig.update_layout(\n", + " autosize=True,\n", + " height=1000,\n", + " )\n", "\n", - "fig.show()" + " fig.show()" ] }, { From 9306e0446181236be5ab88854459f711acdb3dcc Mon Sep 17 00:00:00 2001 From: Mateusz Krainski Date: Tue, 30 Jul 2024 21:52:28 -0400 Subject: [PATCH 2/2] Bump GPT model to 4o instead of 4o-mini; fix replace to use regex --- .github/workflows/ci.yml | 2 +- yafs/notebooks/analysis.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b688a71..6f5c16b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,7 +76,7 @@ jobs: with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - OPENAI_API_MODEL: "gpt-4o-mini" + OPENAI_API_MODEL: "gpt-4o" exclude: "**/*.json, **/*.md, **/*.lock" dependabot: diff --git a/yafs/notebooks/analysis.ipynb b/yafs/notebooks/analysis.ipynb index 5e37f8c..100daee 100644 --- a/yafs/notebooks/analysis.ipynb +++ b/yafs/notebooks/analysis.ipynb @@ -138,7 +138,7 @@ "# In current processing, if the price is like 'CA$ 1,200', the algorithm will split this\n", "# into digits and characters, so we'll get 'CA$,' and '1200'.\n", "# This removes the unnecessary comma.\n", - "df[\"price_currency\"] = df[\"price_currency\"].str.replace(\",\", \"\")" + "df[\"price_currency\"] = df[\"price_currency\"].str.replace(\",\", \"\", regex=False)" ] }, {