Calcule et stocke un score de conformité (#3622)

* Calcule et stocke un score de conformité
* format
* Update test
* Add comments
* PR comments
* Add extra word
* Refactor
* Fix failing test case
etalab · Nov 24, 2023 · d0b8c35 · d0b8c35
1 parent bafe2c2
commit d0b8c35
Show file tree

Hide file tree

Showing 5 changed files with 298 additions and 71 deletions.
diff --git a/apps/transport/lib/db/dataset_score.ex b/apps/transport/lib/db/dataset_score.ex
@@ -9,7 +9,7 @@ defmodule DB.DatasetScore do
 
   typed_schema "dataset_score" do
     belongs_to(:dataset, DB.Dataset)
-    field(:topic, Ecto.Enum, values: [:freshness, :availability])
+    field(:topic, Ecto.Enum, values: [:freshness, :availability, :compliance])
     field(:score, :float)
     field(:timestamp, :utc_datetime_usec)
     field(:details, :map)

diff --git a/apps/transport/lib/jobs/dataset_quality_score.ex b/apps/transport/lib/jobs/dataset_quality_score.ex
@@ -28,9 +28,9 @@ defmodule Transport.Jobs.DatasetQualityScore do
 
   @impl Oban.Worker
   def perform(%Oban.Job{args: %{"dataset_id" => dataset_id}}) do
-    Transport.Jobs.DatasetFreshnessScore.save_freshness_score(dataset_id)
-    Transport.Jobs.DatasetAvailabilityScore.save_availability_score(dataset_id)
-    :ok
+    DB.DatasetScore
+    |> Ecto.Enum.values(:topic)
+    |> Enum.each(fn topic -> save_dataset_score(dataset_id, topic) end)
   end
 
   @doc """
@@ -63,12 +63,23 @@ defmodule Transport.Jobs.DatasetQualityScore do
   @doc """
   Exponential smoothing. See https://en.wikipedia.org/wiki/Exponential_smoothing
 
-  iex> exp_smoothing(0.5, 1)
+  iex> exp_smoothing(0.5, 1, :freshness)
+  0.55
+  iex> exp_smoothing(0.5, 1, 0.9)
   0.55
+  iex> exp_smoothing(0.5, 1, :compliance)
+  0.525
   """
-  @spec exp_smoothing(float, float) :: float
-  def exp_smoothing(previous_score, today_score) do
-    alpha = 0.9
+  @spec exp_smoothing(float(), float(), atom() | float()) :: float()
+  def exp_smoothing(previous_score, today_score, :compliance) do
+    exp_smoothing(previous_score, today_score, 0.95)
+  end
+
+  def exp_smoothing(previous_score, today_score, topic) when topic in [:freshness, :availability] do
+    exp_smoothing(previous_score, today_score, 0.9)
+  end
+
+  def exp_smoothing(previous_score, today_score, alpha) do
     alpha * previous_score + (1.0 - alpha) * today_score
   end
 
@@ -146,7 +157,7 @@ defmodule Transport.Jobs.DatasetQualityScore do
     computed_score =
       case last_score = last_dataset_score(dataset_id, topic) do
         %{score: previous_score} when is_float(previous_score) ->
-          exp_smoothing(previous_score, today_score)
+          exp_smoothing(previous_score, today_score, topic)
 
         _ ->
           today_score
@@ -164,38 +175,98 @@ defmodule Transport.Jobs.DatasetQualityScore do
     Map.fetch!(
       %{
         availability: &Transport.Jobs.DatasetAvailabilityScore.current_dataset_availability/1,
-        freshness: &Transport.Jobs.DatasetFreshnessScore.current_dataset_freshness/1
+        freshness: &Transport.Jobs.DatasetFreshnessScore.current_dataset_freshness/1,
+        compliance: &Transport.Jobs.DatasetComplianceScore.current_dataset_compliance/1
       },
       topic
     )
   end
 end
 
-defmodule Transport.Jobs.DatasetAvailabilityScore do
+defmodule Transport.Jobs.DatasetComplianceScore do
   @moduledoc """
-  Methods specific to the availability component of a dataset score.
+  Methods specific to the compliance component of a dataset score.
+
+  Computes and saves a compliance score for a dataset.
+
+  To compute this score:
+  - get the dataset's current resources
+  - for each resource we validated using a list of validators (`@validators`),
+    give it a score (1 if it's valid, 0 if it has an error)
+  - we compute an average of those scores to get a score at the dataset level
+   - that score is averaged with the dataset's last computed score, using exponential smoothing
+  (see the function `exp_smoothing/3`). This allows a score to reflect not only the current
+  dataset situation but also past situations.
   """
   import Ecto.Query
-  import Transport.Jobs.DatasetQualityScore
+  alias Transport.Jobs.DatasetQualityScore
+
+  @validators_with_has_errors [
+    Transport.Validators.TableSchema,
+    Transport.Validators.EXJSONSchema,
+    Transport.Validators.GBFSValidator
+  ]
+  @gtfs_validator Transport.Validators.GTFSTransport
+  @validators [@gtfs_validator | @validators_with_has_errors]
+  @validators_with_has_errors_names Enum.map(@validators_with_has_errors, & &1.validator_name())
+  @gtfs_validator_name @gtfs_validator.validator_name()
+
+  @spec current_dataset_compliance(integer()) :: %{score: float | nil, details: map()}
+  def current_dataset_compliance(dataset_id) do
+    validation_details =
+      dataset_id
+      |> DB.MultiValidation.dataset_latest_validation(@validators)
+      |> Enum.reject(fn {_resource_id, [multi_validation]} -> is_nil(multi_validation) end)
+
+    current_dataset_infos = Enum.map(validation_details, &resource_compliance(&1))
+
+    score =
+      current_dataset_infos |> Enum.map(fn %{compliance: compliance} -> compliance end) |> DatasetQualityScore.average()
+
+    %{score: score, details: %{resources: current_dataset_infos}}
+  end
+
+  @spec resource_compliance({integer(), [DB.MultiValidation.t()]}) :: %{
+          :compliance => float(),
+          :resource_id => integer(),
+          :raw_measure => map()
+        }
+  # Works for TableSchema + JSON Schema and GBFS
+  def resource_compliance(
+        {resource_id, [%DB.MultiValidation{validator: validator, result: %{"has_errors" => has_errors} = result}]}
+      )
+      when validator in @validators_with_has_errors_names do
+    compliance = if has_errors, do: 0.0, else: 1.0
+    %{compliance: compliance, resource_id: resource_id, raw_measure: result}
+  end
+
+  # For GTFS resources
+  def resource_compliance({resource_id, [%DB.MultiValidation{validator: @gtfs_validator_name, max_error: max_error}]}) do
+    compliance = if max_error in ["Fatal", "Error"], do: 0.0, else: 1.0
+    %{compliance: compliance, resource_id: resource_id, raw_measure: %{"max_error" => max_error}}
+  end
+end
+
+defmodule Transport.Jobs.DatasetAvailabilityScore do
+  @moduledoc """
+  Methods specific to the availability component of a dataset score.
 
-  @doc """
   Saves and computes an availability score for a dataset.
 
   To compute this score:
   - get the dataset's current resources
   - for each resource, give it a score based on its availability over the last 24 hours
   - we compute an average of those scores to get a score at the dataset level
    - that score is averaged with the dataset's last computed score, using exponential smoothing
-  (see the function `exp_smoothing/1` below). This allows a score to reflect not only the current
+  (see the function `exp_smoothing/3`). This allows a score to reflect not only the current
   dataset situation but also past situations.
 
   If any resource has an availability score of 0 (under 95% of availability over the last 24 hours),
   the availability score of the dataset will be 0.
   The rationale is that the entire dataset may be unusable if a single resource cannot be fetched.
   """
-  def save_availability_score(dataset_id) do
-    save_dataset_score(dataset_id, :availability)
-  end
+  import Ecto.Query
+  import Transport.Jobs.DatasetQualityScore
 
   @spec current_dataset_availability(integer()) :: %{score: float | nil, details: map()}
   def current_dataset_availability(dataset_id) do
@@ -279,27 +350,22 @@ end
 defmodule Transport.Jobs.DatasetFreshnessScore do
   @moduledoc """
   Methods specific to the freshness component of a dataset score.
-  """
-  import Ecto.Query
-  import Transport.Jobs.DatasetQualityScore
 
-  @doc """
   Dataset "freshness" is the answer to the question: "When the data was downloaded, was it up-to-date?"
 
   To give a score, we proceed this way:
   - get the dataset's current resources
   - for each resource, give it a score
   - we compute an average of those scores to get a score at the dataset level
   - that score is averaged with the dataset's last computed score, using exponential smoothing
-  (see the function `exp_smoothing/1`). This allows a score to reflect not only the current
+  (see the function `exp_smoothing/3`). This allows a score to reflect not only the current
   dataset situation but also past situations. Typically, a dataset that had outdated resources
   for the past year, but only up-to-date resources today is expected to have a low freshness score.
   The interest of exponential smoothing is to give past scores an increasingly small weight as time
   passes. To have a good score, a dataset must have up-to-date resources every day.
   """
-  def save_freshness_score(dataset_id) do
-    save_dataset_score(dataset_id, :freshness)
-  end
+  import Ecto.Query
+  import Transport.Jobs.DatasetQualityScore
 
   @spec current_dataset_freshness(integer()) :: %{score: float | nil, details: map()}
   def current_dataset_freshness(dataset_id) do

diff --git a/apps/transport/lib/transport_web/templates/dataset/_dataset_scores.html.heex b/apps/transport/lib/transport_web/templates/dataset/_dataset_scores.html.heex
@@ -1,25 +1,21 @@
 <div :if={@dataset_scores != %{}} class="light-grey pt-6">
-  <% freshness_score = Map.get(@dataset_scores, :freshness) %>
-  <div>
-    <%= unless is_nil(freshness_score) do %>
-      Score fraicheur : <%= DB.DatasetScore.score_for_humans(freshness_score) %>
-      <span class="small">
-        <%= freshness_score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
-      </span>
-    <% else %>
-      Pas de score fraicheur
-    <% end %>
-  </div>
-  <% availability_score = Map.get(@dataset_scores, :availability) %>
-  <div>
-    <%= unless is_nil(availability_score) do %>
-      Score de disponibilité : <%= DB.DatasetScore.score_for_humans(availability_score) %>
-      <span class="small">
-        <%= availability_score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
-      </span>
-    <% else %>
-      Pas de score de disponibilité
-    <% end %>
-  </div>
+  <% components = [
+    {:freshness, "fraicheur"},
+    {:availability, "disponibilité"},
+    {:compliance, "conformité"}
+  ] %>
+  <%= for {topic, description} <- components do %>
+    <div>
+      <% score = Map.get(@dataset_scores, topic) %>
+      <%= unless is_nil(score) do %>
+        Score de <%= description %> : <%= DB.DatasetScore.score_for_humans(score) %>
+        <span class="small">
+          <%= score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
+        </span>
+      <% else %>
+        Pas de score de <%= description %>
+      <% end %>
+    </div>
+  <% end %>
   <a href="#scores-chart">Voir plus</a>
 </div>