solve 'one billion row' challenge

This commit is contained in:
Ivan R. 2024-08-24 16:30:35 +05:00
parent 61b8efd211
commit 062c2d2483
Signed by: lumin
GPG key ID: E0937DC7CD6D3817
9 changed files with 159 additions and 0 deletions

View file

@ -0,0 +1,4 @@
# Used by "mix format".
# `inputs` lists the glob patterns of the files the formatter processes:
# the Mix/formatter config files in the project root and every .ex/.exs file
# under config/, lib/ and test/.
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
]

28
one-billion-row/.gitignore vendored Normal file
View file

@ -0,0 +1,28 @@
# The directory Mix will write compiled artifacts to.
/_build/
# If you run "mix test --cover", coverage assets end up here.
/cover/
# The directory Mix downloads your dependencies sources to.
/deps/
# Where third-party dependencies like ExDoc output generated docs.
/doc/
# Ignore .fetch files in case you like to edit your project deps locally.
/.fetch
# If the VM crashes, it generates a dump, let's ignore it too.
erl_crash.dump
# Also ignore archive artifacts (built via "mix archive.build").
*.ez
# Ignore package tarball (built via "mix hex.build").
obr-*.tar
# Temporary files, for example, from tests.
/tmp/
measurements*.txt

View file

@ -0,0 +1,7 @@
# One billion row challenge
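A naive solution to the [One Billion Row Challenge](https://github.com/gunnarmorling/1brc) built on the [Explorer](https://hex.pm/packages/explorer) dataframe library. The task reads `measurements.txt` from the directory it is run in.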
```bash
mix obr
```

View file

@ -0,0 +1,55 @@
defmodule Obr do
  @moduledoc """
  One billion row challenge.

  https://github.com/gunnarmorling/1brc

  Loads a `station;temperature` file into an Explorer data frame, computes the
  minimum, mean and maximum temperature per station and renders the result as
  `{station=min/mean/max, ...}` with the stations in sorted order.
  """

  require Explorer.DataFrame, as: DF

  @typedoc "Station name together with its minimum, mean and maximum temperature."
  @type station_stats :: {name :: String.t(), min :: float(), mean :: float(), max :: float()}

  # Number of fractional digits kept (on the value scaled to tenths) before the
  # ceiling is taken. Anything smaller is treated as binary representation
  # noise rather than as a real fraction of a tenth.
  @noise_digits 9

  @doc """
  Read the provided file and perform calculations.

  The file is parsed as a header-less CSV with `;` as the delimiter: the first
  column is read as the station name (string) and the second as the
  temperature (float). Returns the formatted result string produced by
  `format_results/1`.

  Parsing goes through `Explorer.DataFrame.from_csv!/2`, so an unreadable or
  malformed file raises instead of returning an error tuple.
  """
  @spec process_file(Path.t()) :: String.t()
  def process_file(filepath) do
    filepath
    |> DF.from_csv!(
      header: false,
      delimiter: ";",
      eol_delimiter: "\n",
      dtypes: [column_1: :string, column_2: :float]
    )
    |> process_dataframe()
    |> format_results()
  end

  @doc """
  Process the dataframe and return a list of tuples containing the station
  name, min, mean and max temperatures.

  The data frame must have the station names in `column_1` and the
  temperatures in `column_2`. The returned list is sorted by station name.
  """
  @spec process_dataframe(DF.t()) :: [station_stats()]
  def process_dataframe(df) do
    df
    |> DF.group_by("column_1")
    |> DF.summarise(min: min(column_2), mean: mean(column_2), max: max(column_2))
    |> DF.sort_by(column_1)
    |> DF.select(["column_1", "min", "mean", "max"])
    |> DF.to_rows_stream()
    |> Enum.map(fn row -> {row["column_1"], row["min"], row["mean"], row["max"]} end)
  end

  @doc """
  Render the per-station rows as a single `{a=min/mean/max, b=...}` string.

  Rows are emitted in the order given; an empty list yields `"{}"`.
  """
  @spec format_results([station_stats()]) :: String.t()
  def format_results(rows) do
    "{" <> Enum.map_join(rows, ", ", &format_row/1) <> "}"
  end

  @doc """
  Render one station as `name=min/mean/max`.

  Every temperature is rounded up (towards positive infinity) to one
  fractional digit. Values that already are exact tenths are left untouched.

  ## Examples

      iex> Obr.format_row({"Oslo", -1.0, 0.1, 12.3})
      "Oslo=-1.0/0.1/12.3"

  """
  @spec format_row(station_stats()) :: String.t()
  def format_row({name, min, mean, max}) do
    temperatures = Enum.map_join([min, mean, max], "/", &ceil_to_tenth/1)
    "#{name}=#{temperatures}"
  end

  # Rounds `value` up to one fractional digit.
  #
  # `Float.ceil(value, 1)` is deliberately avoided: it operates on the exact
  # binary value, so an input such as 0.1 (stored as 0.1000000000000000055...)
  # would be bumped to 0.2, and small negatives would come out as -0.0.
  # Instead the value is scaled to tenths, representation noise is rounded
  # away, and the integer ceiling is taken; dividing the integer by 10 can
  # never produce a negative zero.
  @spec ceil_to_tenth(float()) :: float()
  defp ceil_to_tenth(value) do
    tenths =
      (value * 10)
      |> Float.round(@noise_digits)
      |> ceil()

    tenths / 10
  end
end

View file

@ -0,0 +1,9 @@
defmodule Mix.Tasks.Obr do
  @shortdoc "Solves the one billion row challenge for a measurements file"

  @moduledoc """
  Runs `Obr.process_file/1` on a measurements file, then prints the formatted
  result followed by the elapsed time in whole seconds.

      mix obr [PATH]

  `PATH` is optional and defaults to `measurements.txt`, resolved relative to
  the directory the task is run from.
  """

  use Mix.Task

  # Make sure the project is compiled and its dependencies are started before
  # the task body runs.
  @requirements ["app.start"]

  @default_path "measurements.txt"

  @impl Mix.Task
  def run(args) do
    path = input_path(args)

    # :timer.tc/3 returns {elapsed, result}; the elapsed time is reported in
    # the requested unit (seconds), i.e. as an integer.
    {time, res} = :timer.tc(&Obr.process_file/1, [path], :second)

    IO.puts(res)
    IO.puts("Time elapsed: #{time} seconds")
  end

  # The first command-line argument, when present, is the input file;
  # anything else falls back to the default path.
  defp input_path([path | _]) when is_binary(path), do: path
  defp input_path(_), do: @default_path
end

29
one-billion-row/mix.exs Normal file
View file

@ -0,0 +1,29 @@
defmodule Obr.MixProject do
  use Mix.Project

  @app :obr
  @version "0.1.0"
  @elixir_requirement "~> 1.16"

  # Project definition consumed by Mix.
  def project do
    [
      app: @app,
      version: @version,
      elixir: @elixir_requirement,
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # OTP application settings; run "mix help compile.app" for the options.
  def application, do: [extra_applications: [:logger]]

  # Hex dependencies; run "mix help deps" for the accepted formats.
  defp deps, do: [{:explorer, "~> 0.9.0"}]
end

10
one-billion-row/mix.lock Normal file
View file

@ -0,0 +1,10 @@
%{
"aws_signature": {:hex, :aws_signature, "0.3.2", "adf33bc4af00b2089b7708bf20e3246f09c639a905a619b3689f0a0a22c3ef8f", [:rebar3], [], "hexpm", "b0daf61feb4250a8ab0adea60db3e336af732ff71dd3fb22e45ae3dcbd071e44"},
"castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"},
"emmap": {:hex, :emmap, "2.0.11", "aec85ae663998329cf4343ef8d3f6b49fc1b054362e6d2689ef3314f7f36d351", [:rebar3], [], "hexpm", "10789b911658c672c62a979f34eb8d1fc3f8237a367f7d5dc3dbb3e1ebefd1c4"},
"explorer": {:hex, :explorer, "0.9.2", "a9598eeff8d36d88f643d14818bea1869ca70c4def61bfba22f040ee315b84b6", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", "63057e318d613c1819bd8bee2d8ed4f7061c3136edc6832ad18243d28e6344eb"},
"fss": {:hex, :fss, "0.1.1", "9db2344dbbb5d555ce442ac7c2f82dd975b605b50d169314a20f08ed21e08642", [:mix], [], "hexpm", "78ad5955c7919c3764065b21144913df7515d52e228c09427a004afe9c1a16b0"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.8.0", "02d218b575d8175e80138557f46bee7af5598f29e9aff8935a6c369c0e6c47a5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "00b1711d8d828200fe931e23bb0e72c2672a3a0ef76740e3c50433afda1965fb"},
"table": {:hex, :table, "0.1.2", "87ad1125f5b70c5dea0307aa633194083eb5182ec537efc94e96af08937e14a8", [:mix], [], "hexpm", "7e99bc7efef806315c7e65640724bf165c3061cdc5d854060f74468367065029"},
"table_rex": {:hex, :table_rex, "4.0.0", "3c613a68ebdc6d4d1e731bc973c233500974ec3993c99fcdabb210407b90959b", [:mix], [], "hexpm", "c35c4d5612ca49ebb0344ea10387da4d2afe278387d4019e4d8111e815df8f55"},
}

View file

@ -0,0 +1,16 @@
defmodule ObrTest do
  # The tests only build in-memory values, so they can run concurrently.
  use ExUnit.Case, async: true
  doctest Obr

  describe "process_dataframe/1" do
    test "dataframe processing is correct" do
      df =
        Explorer.DataFrame.new(
          column_1: ["c", "b", "c", "c"],
          column_2: [-6.0, 3.0, 1.2, 2.0]
        )

      assert [{"b", b_min, b_mean, b_max}, {"c", c_min, c_mean, c_max}] =
               Obr.process_dataframe(df)

      # Min and max are picked from the inputs, so they compare exactly.
      assert {b_min, b_max} == {3.0, 3.0}
      assert {c_min, c_max} == {-6.0, 2.0}

      # The mean is computed, so allow for floating-point summation error
      # instead of demanding bit-for-bit equality.
      assert_in_delta b_mean, 3.0, 1.0e-9
      assert_in_delta c_mean, -2.8 / 3, 1.0e-9
    end
  end

  describe "format_results/1" do
    test "formatting is correct" do
      src = [{"b", 3.0, 2.91, 4.0}, {"c", -6.0, -2.8 / 3, 2.0}]

      assert Obr.format_results(src) == "{b=3.0/3.0/4.0, c=-6.0/-0.9/2.0}"
    end
  end
end

View file

@ -0,0 +1 @@
# Start ExUnit so the test modules of this project can run.
ExUnit.start()