Skip to content

Commit

Permalink
Calcule et stocke un score de conformité (#3622)
Browse files Browse the repository at this point in the history
* Calcule et stocke un score de conformité

* format

* Update test

* Add comments

* PR comments

* Add extra word

* Refactor

* Fix failing test case
  • Loading branch information
AntoineAugusti authored Nov 24, 2023
1 parent bafe2c2 commit d0b8c35
Show file tree
Hide file tree
Showing 5 changed files with 298 additions and 71 deletions.
2 changes: 1 addition & 1 deletion apps/transport/lib/db/dataset_score.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ defmodule DB.DatasetScore do

typed_schema "dataset_score" do
belongs_to(:dataset, DB.Dataset)
field(:topic, Ecto.Enum, values: [:freshness, :availability])
field(:topic, Ecto.Enum, values: [:freshness, :availability, :compliance])
field(:score, :float)
field(:timestamp, :utc_datetime_usec)
field(:details, :map)
Expand Down
116 changes: 91 additions & 25 deletions apps/transport/lib/jobs/dataset_quality_score.ex
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ defmodule Transport.Jobs.DatasetQualityScore do

@impl Oban.Worker
def perform(%Oban.Job{args: %{"dataset_id" => dataset_id}}) do
Transport.Jobs.DatasetFreshnessScore.save_freshness_score(dataset_id)
Transport.Jobs.DatasetAvailabilityScore.save_availability_score(dataset_id)
:ok
DB.DatasetScore
|> Ecto.Enum.values(:topic)
|> Enum.each(fn topic -> save_dataset_score(dataset_id, topic) end)
end

@doc """
Expand Down Expand Up @@ -63,12 +63,23 @@ defmodule Transport.Jobs.DatasetQualityScore do
@doc """
Exponential smoothing. See https://en.wikipedia.org/wiki/Exponential_smoothing
iex> exp_smoothing(0.5, 1)
iex> exp_smoothing(0.5, 1, :freshness)
0.55
iex> exp_smoothing(0.5, 1, 0.9)
0.55
iex> exp_smoothing(0.5, 1, :compliance)
0.525
"""
@spec exp_smoothing(float, float) :: float
def exp_smoothing(previous_score, today_score) do
alpha = 0.9
# Exponential smoothing of a score: blends the previous score with today's score.
#
# The third argument is either a topic atom, which selects the smoothing factor
# (0.95 for `:compliance`, 0.9 for `:freshness` and `:availability`), or the
# smoothing factor `alpha` itself, given as a number.
#
# The result is `alpha * previous_score + (1.0 - alpha) * today_score`, so a
# higher `alpha` gives more weight to the previous score.
@spec exp_smoothing(float(), float(), atom() | float()) :: float()
def exp_smoothing(previous_score, today_score, :compliance) do
  exp_smoothing(previous_score, today_score, 0.95)
end

def exp_smoothing(previous_score, today_score, topic) when topic in [:freshness, :availability] do
  exp_smoothing(previous_score, today_score, 0.9)
end

# The guard makes an unknown topic atom fail with a `FunctionClauseError` that
# names this function, instead of reaching the arithmetic below and raising an
# `ArithmeticError`.
def exp_smoothing(previous_score, today_score, alpha) when is_number(alpha) do
  alpha * previous_score + (1.0 - alpha) * today_score
end

Expand Down Expand Up @@ -146,7 +157,7 @@ defmodule Transport.Jobs.DatasetQualityScore do
computed_score =
case last_score = last_dataset_score(dataset_id, topic) do
%{score: previous_score} when is_float(previous_score) ->
exp_smoothing(previous_score, today_score)
exp_smoothing(previous_score, today_score, topic)

_ ->
today_score
Expand All @@ -164,38 +175,98 @@ defmodule Transport.Jobs.DatasetQualityScore do
Map.fetch!(
%{
availability: &Transport.Jobs.DatasetAvailabilityScore.current_dataset_availability/1,
freshness: &Transport.Jobs.DatasetFreshnessScore.current_dataset_freshness/1
freshness: &Transport.Jobs.DatasetFreshnessScore.current_dataset_freshness/1,
compliance: &Transport.Jobs.DatasetComplianceScore.current_dataset_compliance/1
},
topic
)
end
end

defmodule Transport.Jobs.DatasetAvailabilityScore do
defmodule Transport.Jobs.DatasetComplianceScore do
@moduledoc """
Methods specific to the availability component of a dataset score.
Methods specific to the compliance component of a dataset score.
Computes and saves a compliance score for a dataset.
To compute this score:
- get the dataset's current resources
- for each resource we validated using a list of validators (`@validators`),
give it a score (1 if it's valid, 0 if it has an error)
- we compute an average of those scores to get a score at the dataset level
- that score is averaged with the dataset's last computed score, using exponential smoothing
(see the function `exp_smoothing/3`). This allows a score to reflect not only the current
dataset situation but also past situations.
"""
import Ecto.Query
import Transport.Jobs.DatasetQualityScore
alias Transport.Jobs.DatasetQualityScore

@validators_with_has_errors [
Transport.Validators.TableSchema,
Transport.Validators.EXJSONSchema,
Transport.Validators.GBFSValidator
]
@gtfs_validator Transport.Validators.GTFSTransport
@validators [@gtfs_validator | @validators_with_has_errors]
@validators_with_has_errors_names Enum.map(@validators_with_has_errors, & &1.validator_name())
@gtfs_validator_name @gtfs_validator.validator_name()

# Computes the current compliance of a dataset from the latest validations of
# its resources, restricted to the validators listed in `@validators`.
#
# Returns a map with:
# - `score`: the result of `DatasetQualityScore.average/1` over the per-resource
#   compliance values (presumably `nil` when there is nothing to average, as the
#   spec allows it; confirm against `average/1`)
# - `details`: `%{resources: [...]}`, the per-resource maps built by
#   `resource_compliance/1`
@spec current_dataset_compliance(integer()) :: %{score: float | nil, details: map()}
def current_dataset_compliance(dataset_id) do
  latest_validations = DB.MultiValidation.dataset_latest_validation(dataset_id, @validators)

  # Resources whose latest validation is `nil` do not take part in the score.
  resources =
    latest_validations
    |> Enum.filter(fn {_resource_id, [multi_validation]} -> not is_nil(multi_validation) end)
    |> Enum.map(&resource_compliance/1)

  score =
    resources
    |> Enum.map(fn %{compliance: value} -> value end)
    |> DatasetQualityScore.average()

  %{score: score, details: %{resources: resources}}
end

# Computes the compliance of a single resource from its latest validation.
#
# Takes a `{resource_id, [multi_validation]}` tuple and returns a map with the
# `compliance` value (0.0 when the validation reports a failure, 1.0 otherwise),
# the `resource_id` and the `raw_measure` the value was derived from.
@spec resource_compliance({integer(), [DB.MultiValidation.t()]}) :: %{
        :compliance => float(),
        :resource_id => integer(),
        :raw_measure => map()
      }
# Validators exposing a "has_errors" key in their result
# (TableSchema, JSON Schema and GBFS).
def resource_compliance(
      {resource_id, [%DB.MultiValidation{validator: validator, result: %{"has_errors" => has_errors} = result}]}
    )
    when validator in @validators_with_has_errors_names do
  %{
    compliance: compliance_value(has_errors),
    resource_id: resource_id,
    raw_measure: result
  }
end

# GTFS resources: the validation fails when the highest severity found is
# "Fatal" or "Error".
def resource_compliance({resource_id, [%DB.MultiValidation{validator: @gtfs_validator_name, max_error: max_error}]}) do
  %{
    compliance: compliance_value(max_error in ["Fatal", "Error"]),
    resource_id: resource_id,
    raw_measure: %{"max_error" => max_error}
  }
end

# Maps a "validation failed" flag (any truthy value) to 0.0 and anything else to 1.0.
defp compliance_value(failed), do: if(failed, do: 0.0, else: 1.0)
end

defmodule Transport.Jobs.DatasetAvailabilityScore do
@moduledoc """
Methods specific to the availability component of a dataset score.
@doc """
Saves and computes an availability score for a dataset.
To compute this score:
- get the dataset's current resources
- for each resource, give it a score based on its availability over the last 24 hours
- we compute an average of those scores to get a score at the dataset level
- that score is averaged with the dataset's last computed score, using exponential smoothing
(see the function `exp_smoothing/1` below). This allows a score to reflect not only the current
(see the function `exp_smoothing/3`). This allows a score to reflect not only the current
dataset situation but also past situations.
If any resource has an availability score of 0 (under 95% of availability over the last 24 hours),
the availability score of the dataset will be 0.
The rationale is that the entire dataset may be unusable if a single resource cannot be fetched.
"""
def save_availability_score(dataset_id) do
save_dataset_score(dataset_id, :availability)
end
import Ecto.Query
import Transport.Jobs.DatasetQualityScore

@spec current_dataset_availability(integer()) :: %{score: float | nil, details: map()}
def current_dataset_availability(dataset_id) do
Expand Down Expand Up @@ -279,27 +350,22 @@ end
defmodule Transport.Jobs.DatasetFreshnessScore do
@moduledoc """
Methods specific to the freshness component of a dataset score.
"""
import Ecto.Query
import Transport.Jobs.DatasetQualityScore
@doc """
Dataset "freshness" is the answer to the question: "When the data was downloaded, was it up-to-date?"
To give a score, we proceed this way:
- get the dataset's current resources
- for each resource, give it a score
- we compute an average of those scores to get a score at the dataset level
- that score is averaged with the dataset's last computed score, using exponential smoothing
(see the function `exp_smoothing/1`). This allows a score to reflect not only the current
(see the function `exp_smoothing/3`). This allows a score to reflect not only the current
dataset situation but also past situations. Typically, a dataset that had outdated resources
for the past year, but only up-to-date resources today is expected to have a low freshness score.
The interest of exponential smoothing is to give past scores an increasingly small weight as time
passes. To have a good score, a dataset must have up-to-date resources every day.
"""
def save_freshness_score(dataset_id) do
save_dataset_score(dataset_id, :freshness)
end
import Ecto.Query
import Transport.Jobs.DatasetQualityScore

@spec current_dataset_freshness(integer()) :: %{score: float | nil, details: map()}
def current_dataset_freshness(dataset_id) do
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,21 @@
<div :if={@dataset_scores != %{}} class="light-grey pt-6">
<% freshness_score = Map.get(@dataset_scores, :freshness) %>
<div>
<%= unless is_nil(freshness_score) do %>
Score fraicheur : <%= DB.DatasetScore.score_for_humans(freshness_score) %>
<span class="small">
<%= freshness_score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
</span>
<% else %>
Pas de score fraicheur
<% end %>
</div>
<% availability_score = Map.get(@dataset_scores, :availability) %>
<div>
<%= unless is_nil(availability_score) do %>
Score de disponibilité : <%= DB.DatasetScore.score_for_humans(availability_score) %>
<span class="small">
<%= availability_score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
</span>
<% else %>
Pas de score de disponibilité
<% end %>
</div>
<% components = [
{:freshness, "fraicheur"},
{:availability, "disponibilité"},
{:compliance, "conformité"}
] %>
<%= for {topic, description} <- components do %>
<div>
<% score = Map.get(@dataset_scores, topic) %>
<%= unless is_nil(score) do %>
Score de <%= description %> : <%= DB.DatasetScore.score_for_humans(score) %>
<span class="small">
<%= score.timestamp |> Shared.DateTimeDisplay.format_datetime_to_paris(@locale) %>
</span>
<% else %>
Pas de score de <%= description %>
<% end %>
</div>
<% end %>
<a href="#scores-chart">Voir plus</a>
</div>
Loading

0 comments on commit d0b8c35

Please sign in to comment.