my-solutions/one-billion-row/lib/obr.ex

56 lines
1.2 KiB
Elixir

defmodule Obr do
@moduledoc """
One billion row challenge.
https://github.com/gunnarmorling/1brc
"""
require Explorer.DataFrame, as: DF
@doc """
Loads the measurements file at `filepath` and returns the formatted summary string.

The file is parsed as a header-less CSV with `;` as the field delimiter and `\\n` as
the line delimiter; `column_1` is read as a string and `column_2` as a float. The
resulting dataframe is passed to `process_dataframe/1`, and the rows it returns are
rendered by `format_results/1`.
"""
def process_file(filepath) do
  csv_opts = [
    header: false,
    delimiter: ";",
    eol_delimiter: "\n",
    dtypes: [column_1: :string, column_2: :float]
  ]

  measurements = DF.from_csv!(filepath, csv_opts)
  summary_rows = process_dataframe(measurements)
  format_results(summary_rows)
end
@doc """
Summarises a dataframe of measurements per station.

Rows are grouped by `column_1`; for each group the minimum, mean and maximum of
`column_2` are computed, and the summary is sorted by `column_1`.

Returns a list of `{station, min, mean, max}` tuples.
"""
def process_dataframe(df) do
  grouped = DF.group_by(df, "column_1")
  stats = DF.summarise(grouped, min: min(column_2), mean: mean(column_2), max: max(column_2))

  ordered =
    stats
    |> DF.sort_by(column_1)
    |> DF.select(["column_1", "min", "mean", "max"])

  # Each streamed row is a map keyed by column name; flatten it into a tuple.
  for row <- DF.to_rows_stream(ordered) do
    {row["column_1"], row["min"], row["mean"], row["max"]}
  end
end
@doc """
Renders a list of `{station, min, mean, max}` tuples as one string.

Every tuple is formatted with `format_row/1`, the pieces are joined with `", "`,
and the whole result is wrapped in curly braces.
"""
def format_results(rows) do
  body = Enum.map_join(rows, ", ", &format_row/1)
  "{" <> body <> "}"
end
@doc """
Formats one station summary as `"name=min/mean/max"`.

Each temperature must be a float and is rounded to one fractional digit.

Rounding uses `Float.round/2` (round to nearest) rather than `Float.ceil/2`:
`Float.ceil/2` operates on the exact binary value of the float, so a reading such
as `12.3` (stored as slightly more than 12.3) would be bumped up to `12.4`.

## Examples

    iex> Obr.format_row({"Hamburg", 12.3, 18.2, 34.1})
    "Hamburg=12.3/18.2/34.1"

"""
def format_row({name, min, mean, max}) do
  temperatures = Enum.map_join([min, mean, max], "/", &Float.round(&1, 1))
  "#{name}=#{temperatures}"
end
end