my-solutions/one-billion-row/lib/obr.ex

defmodule Obr do
  @moduledoc """
  One billion row challenge.

  Reads a `;`-separated file of `station;temperature` lines, aggregates the
  minimum, mean and maximum temperature per station, and renders the result
  as a single `{name=min/mean/max, ...}` string.

  https://github.com/gunnarmorling/1brc
  """

  require Explorer.DataFrame, as: DF

  @doc """
  Read the provided file and perform calculations.

  The file is parsed as a header-less CSV with `;` as the delimiter, so the
  columns get the default names `column_1` (station name, string) and
  `column_2` (temperature, float).

  Returns the formatted result string produced by `format_results/1`.
  Raises if the file cannot be read or parsed (`DF.from_csv!/2`).
  """
  @spec process_file(String.t()) :: String.t()
  def process_file(filepath) do
    filepath
    |> DF.from_csv!(
      header: false,
      delimiter: ";",
      eol_delimiter: "\n",
      dtypes: [column_1: :string, column_2: :float]
    )
    |> process_dataframe()
    |> format_results()
  end

  @doc """
  Process the dataframe and return a list of tuples containing the station
  name, min, mean and max temperatures.

  The dataframe must have a `column_1` (station name) and a `column_2`
  (temperature) column. Rows are grouped by station and the resulting
  `{name, min, mean, max}` tuples are sorted by station name.
  """
  @spec process_dataframe(Explorer.DataFrame.t()) :: [{String.t(), float(), float(), float()}]
  def process_dataframe(df) do
    df
    |> DF.group_by("column_1")
    |> DF.summarise(min: min(column_2), mean: mean(column_2), max: max(column_2))
    |> DF.sort_by(column_1)
    |> DF.select(["column_1", "min", "mean", "max"])
    |> DF.to_rows_stream()
    |> Enum.map(fn row -> {row["column_1"], row["min"], row["mean"], row["max"]} end)
  end

  @doc """
  Render a list of `{name, min, mean, max}` tuples as the challenge output.

  Each tuple is formatted with `format_row/1`; the entries are joined with
  `", "` and wrapped in curly braces. An empty list yields `"{}"`.
  """
  @spec format_results([{String.t(), number(), number(), number()}]) :: String.t()
  def format_results(rows) do
    "{#{Enum.map_join(rows, ", ", &format_row/1)}}"
  end

  @doc """
  Render one station as `name=min/mean/max`.

  Every temperature is rounded to one fractional digit, to the nearest tenth
  with ties going toward positive infinity (the same result as the reference
  implementation's `Math.round(v * 10.0) / 10.0`).
  """
  @spec format_row({String.t(), number(), number(), number()}) :: String.t()
  def format_row({name, min, mean, max}) do
    temperatures = Enum.map_join([min, mean, max], "/", &round_half_up/1)

    "#{name}=#{temperatures}"
  end

  # Rounds to one decimal place, ties toward positive infinity.
  #
  # `Float.ceil(value, 1)` is deliberately NOT used: it operates on the exact
  # binary value of the float, so a reading such as 0.1 (stored as
  # 0.1000000000000000055...) would be pushed up to 0.2, and any mean with a
  # non-zero second decimal would always be rounded up instead of to nearest.
  defp round_half_up(value), do: Float.floor(value * 10 + 0.5) / 10
end