solve 'one billion row' challenge
This commit is contained in:
parent
61b8efd211
commit
062c2d2483
9 changed files with 159 additions and 0 deletions
4
one-billion-row/.formatter.exs
Normal file
4
one-billion-row/.formatter.exs
Normal file
|
@ -0,0 +1,4 @@
|
|||
# Used by "mix format"
|
||||
[
|
||||
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"]
|
||||
]
|
28
one-billion-row/.gitignore
vendored
Normal file
28
one-billion-row/.gitignore
vendored
Normal file
|
@ -0,0 +1,28 @@
|
|||
# The directory Mix will write compiled artifacts to.
|
||||
/_build/
|
||||
|
||||
# If you run "mix test --cover", coverage assets end up here.
|
||||
/cover/
|
||||
|
||||
# The directory Mix downloads your dependencies sources to.
|
||||
/deps/
|
||||
|
||||
# Where third-party dependencies like ExDoc output generated docs.
|
||||
/doc/
|
||||
|
||||
# Ignore .fetch files in case you like to edit your project deps locally.
|
||||
/.fetch
|
||||
|
||||
# If the VM crashes, it generates a dump, let's ignore it too.
|
||||
erl_crash.dump
|
||||
|
||||
# Also ignore archive artifacts (built via "mix archive.build").
|
||||
*.ez
|
||||
|
||||
# Ignore package tarball (built via "mix hex.build").
|
||||
obr-*.tar
|
||||
|
||||
# Temporary files, for example, from tests.
|
||||
/tmp/
|
||||
|
||||
measurements*.txt
|
7
one-billion-row/README.md
Normal file
7
one-billion-row/README.md
Normal file
|
@ -0,0 +1,7 @@
|
|||
# One billion row challenge
|
||||
|
||||
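A naive solution built on the Explorer dataframe library. It expects a `measurements.txt` file in the directory the task is run from.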
|
||||
|
||||
```bash
|
||||
mix obr
|
||||
```
|
55
one-billion-row/lib/obr.ex
Normal file
55
one-billion-row/lib/obr.ex
Normal file
|
@ -0,0 +1,55 @@
|
|||
defmodule Obr do
  @moduledoc """
  One billion row challenge.

  Loads `station;temperature` rows into an Explorer dataframe, aggregates the
  minimum, mean and maximum temperature per station and renders the result
  as a single line of text.

  https://github.com/gunnarmorling/1brc
  """

  require Explorer.DataFrame, as: DF

  @typedoc "Station name followed by its minimum, mean and maximum temperature."
  @type station_stats :: {name :: String.t(), min :: float(), mean :: float(), max :: float()}

  @doc """
  Reads the file at `filepath`, aggregates it and returns the formatted result.

  The file is parsed as a header-less, `;`-delimited CSV whose first column is
  the station name (string) and whose second column is the temperature (float).
  Loading goes through `Explorer.DataFrame.from_csv!/2`, the raising variant.
  """
  @spec process_file(String.t()) :: String.t()
  def process_file(filepath) do
    filepath
    |> DF.from_csv!(
      header: false,
      delimiter: ";",
      eol_delimiter: "\n",
      dtypes: [column_1: :string, column_2: :float]
    )
    |> process_dataframe()
    |> format_results()
  end

  @doc """
  Processes the dataframe and returns a list of `{station, min, mean, max}` tuples.

  The dataframe must have a `column_1` (station name) and a `column_2`
  (temperature) column. The returned list is sorted by station name.
  """
  @spec process_dataframe(DF.t()) :: [station_stats()]
  def process_dataframe(df) do
    df
    |> DF.group_by("column_1")
    |> DF.summarise(min: min(column_2), mean: mean(column_2), max: max(column_2))
    |> DF.sort_by(column_1)
    |> DF.select(["column_1", "min", "mean", "max"])
    |> DF.to_rows_stream()
    |> Enum.map(fn row -> {row["column_1"], row["min"], row["mean"], row["max"]} end)
  end

  @doc """
  Renders the aggregated rows as `{name=min/mean/max, ...}`.

  An empty list renders as `{}`.
  """
  @spec format_results([station_stats()]) :: String.t()
  def format_results(rows) do
    "{#{Enum.map_join(rows, ", ", &format_row/1)}}"
  end

  @doc """
  Renders a single station as `name=min/mean/max`.

  Every temperature is rounded up (towards positive infinity) to one
  fractional digit.
  """
  @spec format_row(station_stats()) :: String.t()
  def format_row({name, min, mean, max}) do
    temperatures = Enum.map_join([min, mean, max], "/", &round_up/1)

    "#{name}=#{temperatures}"
  end

  # Rounds towards positive infinity with one fractional digit.
  defp round_up(value) do
    rounded = Float.ceil(value, 1)

    # `Float.ceil/2` keeps the sign of small negative inputs and yields -0.0,
    # which would be printed as "-0.0"; collapse it to plain zero.
    if rounded == 0, do: 0.0, else: rounded
  end
end
|
9
one-billion-row/lib/task.ex
Normal file
9
one-billion-row/lib/task.ex
Normal file
|
@ -0,0 +1,9 @@
|
|||
defmodule Mix.Tasks.Obr do
  @shortdoc "Runs the one billion row challenge and prints the timing"

  @moduledoc """
  Processes a measurements file with `Obr.process_file/1`, then prints the
  result followed by the elapsed wall-clock time in whole seconds.

      mix obr [PATH]

  `PATH` defaults to `measurements.txt`, resolved against the current
  working directory.
  """

  use Mix.Task

  @default_path "measurements.txt"

  @impl Mix.Task
  def run(args) do
    path = List.first(args) || @default_path

    # `:timer.tc/1` is available on every supported OTP release, whereas the
    # `tc(fun, args, unit)` form only exists since OTP 26 and is otherwise
    # mistaken for `tc(module, function, args)`.
    {microseconds, res} = :timer.tc(fn -> Obr.process_file(path) end)

    IO.puts(res)
    IO.puts("Time elapsed: #{div(microseconds, 1_000_000)} seconds")
  end
end
|
29
one-billion-row/mix.exs
Normal file
29
one-billion-row/mix.exs
Normal file
|
@ -0,0 +1,29 @@
|
|||
defmodule Obr.MixProject do
  use Mix.Project

  @app :obr
  @version "0.1.0"
  @elixir_requirement "~> 1.16"

  # Project definition read by Mix.
  def project do
    [
      app: @app,
      version: @version,
      elixir: @elixir_requirement,
      start_permanent: Mix.env() == :prod,
      deps: deps()
    ]
  end

  # OTP application settings; see "mix help compile.app".
  def application, do: [extra_applications: [:logger]]

  # Hex dependencies; see "mix help deps".
  defp deps, do: [{:explorer, "~> 0.9.0"}]
end
|
10
one-billion-row/mix.lock
Normal file
10
one-billion-row/mix.lock
Normal file
|
@ -0,0 +1,10 @@
|
|||
%{
|
||||
"aws_signature": {:hex, :aws_signature, "0.3.2", "adf33bc4af00b2089b7708bf20e3246f09c639a905a619b3689f0a0a22c3ef8f", [:rebar3], [], "hexpm", "b0daf61feb4250a8ab0adea60db3e336af732ff71dd3fb22e45ae3dcbd071e44"},
|
||||
"castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"},
|
||||
"emmap": {:hex, :emmap, "2.0.11", "aec85ae663998329cf4343ef8d3f6b49fc1b054362e6d2689ef3314f7f36d351", [:rebar3], [], "hexpm", "10789b911658c672c62a979f34eb8d1fc3f8237a367f7d5dc3dbb3e1ebefd1c4"},
|
||||
"explorer": {:hex, :explorer, "0.9.2", "a9598eeff8d36d88f643d14818bea1869ca70c4def61bfba22f040ee315b84b6", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", "63057e318d613c1819bd8bee2d8ed4f7061c3136edc6832ad18243d28e6344eb"},
|
||||
"fss": {:hex, :fss, "0.1.1", "9db2344dbbb5d555ce442ac7c2f82dd975b605b50d169314a20f08ed21e08642", [:mix], [], "hexpm", "78ad5955c7919c3764065b21144913df7515d52e228c09427a004afe9c1a16b0"},
|
||||
"rustler_precompiled": {:hex, :rustler_precompiled, "0.8.0", "02d218b575d8175e80138557f46bee7af5598f29e9aff8935a6c369c0e6c47a5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "00b1711d8d828200fe931e23bb0e72c2672a3a0ef76740e3c50433afda1965fb"},
|
||||
"table": {:hex, :table, "0.1.2", "87ad1125f5b70c5dea0307aa633194083eb5182ec537efc94e96af08937e14a8", [:mix], [], "hexpm", "7e99bc7efef806315c7e65640724bf165c3061cdc5d854060f74468367065029"},
|
||||
"table_rex": {:hex, :table_rex, "4.0.0", "3c613a68ebdc6d4d1e731bc973c233500974ec3993c99fcdabb210407b90959b", [:mix], [], "hexpm", "c35c4d5612ca49ebb0344ea10387da4d2afe278387d4019e4d8111e815df8f55"},
|
||||
}
|
16
one-billion-row/test/obr_test.exs
Normal file
16
one-billion-row/test/obr_test.exs
Normal file
|
@ -0,0 +1,16 @@
|
|||
defmodule ObrTest do
  # Neither test touches shared state, so they can run concurrently.
  use ExUnit.Case, async: true
  doctest Obr

  test "dataframe processing is correct" do
    df = Explorer.DataFrame.new(column_1: ["c", "b", "c", "c"], column_2: [-6.0, 3.0, 1.2, 2.0])

    # Min and max are values picked from the input, so they match exactly.
    assert [{"b", 3.0, 3.0, 3.0}, {"c", -6.0, c_mean, 2.0}] = Obr.process_dataframe(df)

    # The mean is computed, so compare with a tolerance rather than relying on
    # the backend summing in one particular order.
    assert_in_delta c_mean, -2.8 / 3, 1.0e-9
  end

  test "formatting is correct" do
    src = [{"b", 3.0, 2.91, 4.0}, {"c", -6.0, -2.8 / 3, 2.0}]
    str = Obr.format_results(src)
    assert str == "{b=3.0/3.0/4.0, c=-6.0/-0.9/2.0}"
  end
end
|
1
one-billion-row/test/test_helper.exs
Normal file
1
one-billion-row/test/test_helper.exs
Normal file
|
@ -0,0 +1 @@
|
|||
ExUnit.start()
|
Loading…
Reference in a new issue