From 062c2d24838afb9039c25ce79c900a2d6cdcbc91 Mon Sep 17 00:00:00 2001 From: Ivan Reshetnikov Date: Sat, 24 Aug 2024 16:30:35 +0500 Subject: [PATCH] solve 'one billion row' challenge --- one-billion-row/.formatter.exs | 4 ++ one-billion-row/.gitignore | 28 ++++++++++++++ one-billion-row/README.md | 7 ++++ one-billion-row/lib/obr.ex | 55 ++++++++++++++++++++++++++++ one-billion-row/lib/task.ex | 9 +++++ one-billion-row/mix.exs | 29 +++++++++++++++ one-billion-row/mix.lock | 10 +++++ one-billion-row/test/obr_test.exs | 16 ++++++++ one-billion-row/test/test_helper.exs | 1 + 9 files changed, 159 insertions(+) create mode 100644 one-billion-row/.formatter.exs create mode 100644 one-billion-row/.gitignore create mode 100644 one-billion-row/README.md create mode 100644 one-billion-row/lib/obr.ex create mode 100644 one-billion-row/lib/task.ex create mode 100644 one-billion-row/mix.exs create mode 100644 one-billion-row/mix.lock create mode 100644 one-billion-row/test/obr_test.exs create mode 100644 one-billion-row/test/test_helper.exs diff --git a/one-billion-row/.formatter.exs b/one-billion-row/.formatter.exs new file mode 100644 index 0000000..d2cda26 --- /dev/null +++ b/one-billion-row/.formatter.exs @@ -0,0 +1,4 @@ +# Used by "mix format" +[ + inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] +] diff --git a/one-billion-row/.gitignore b/one-billion-row/.gitignore new file mode 100644 index 0000000..5ecbb16 --- /dev/null +++ b/one-billion-row/.gitignore @@ -0,0 +1,28 @@ +# The directory Mix will write compiled artifacts to. +/_build/ + +# If you run "mix test --cover", coverage assets end up here. +/cover/ + +# The directory Mix downloads your dependencies sources to. +/deps/ + +# Where third-party dependencies like ExDoc output generated docs. +/doc/ + +# Ignore .fetch files in case you like to edit your project deps locally. +/.fetch + +# If the VM crashes, it generates a dump, let's ignore it too. 
+erl_crash.dump + +# Also ignore archive artifacts (built via "mix archive.build"). +*.ez + +# Ignore package tarball (built via "mix hex.build"). +obr-*.tar + +# Temporary files, for example, from tests. +/tmp/ + +measurements*.txt diff --git a/one-billion-row/README.md b/one-billion-row/README.md new file mode 100644 index 0000000..45a54fb --- /dev/null +++ b/one-billion-row/README.md @@ -0,0 +1,7 @@ +# One billion row challenge + +Naive solution with explorer. + +```bash +mix obr +``` diff --git a/one-billion-row/lib/obr.ex b/one-billion-row/lib/obr.ex new file mode 100644 index 0000000..aba02f5 --- /dev/null +++ b/one-billion-row/lib/obr.ex @@ -0,0 +1,55 @@ +defmodule Obr do + @moduledoc """ + One billion row challenge. + + https://github.com/gunnarmorling/1brc + """ + + require Explorer.DataFrame, as: DF + + @doc """ + Read the provided file and perform calculations. + """ + def process_file(filepath) do + filepath + |> DF.from_csv!( + header: false, + delimiter: ";", + eol_delimiter: "\n", + dtypes: [column_1: :string, column_2: :float] + ) + |> process_dataframe() + |> format_results() + end + + @doc """ + Process the dataframe and return a list of tuples containing the station name, min, mean and max temperatures. 
+ """ + def process_dataframe(df) do + df + |> DF.group_by("column_1") + |> DF.summarise(min: min(column_2), mean: mean(column_2), max: max(column_2)) + |> DF.sort_by(column_1) + |> DF.select(["column_1", "min", "mean", "max"]) + |> DF.to_rows_stream() + |> Enum.map(fn row -> {row["column_1"], row["min"], row["mean"], row["max"]} end) + end + + def format_results(rows) do + str = + rows + |> Enum.map(&format_row/1) + |> Enum.join(", ") + + "{#{str}}" + end + + def format_row({name, min, mean, max}) do + temperatures = + [min, mean, max] + |> Enum.map(&Float.ceil(&1, 1)) + |> Enum.join("/") + + "#{name}=#{temperatures}" + end +end diff --git a/one-billion-row/lib/task.ex b/one-billion-row/lib/task.ex new file mode 100644 index 0000000..ac3b713 --- /dev/null +++ b/one-billion-row/lib/task.ex @@ -0,0 +1,9 @@ +defmodule Mix.Tasks.Obr do + use Mix.Task + + def run(_) do + {time, res} = :timer.tc(&Obr.process_file/1, ["measurements.txt"], :second) + IO.puts(res) + IO.puts("Time elapsed: #{time} seconds") + end +end diff --git a/one-billion-row/mix.exs b/one-billion-row/mix.exs new file mode 100644 index 0000000..102f1df --- /dev/null +++ b/one-billion-row/mix.exs @@ -0,0 +1,29 @@ +defmodule Obr.MixProject do + use Mix.Project + + def project do + [ + app: :obr, + version: "0.1.0", + elixir: "~> 1.16", + start_permanent: Mix.env() == :prod, + deps: deps() + ] + end + + # Run "mix help compile.app" to learn about applications. + def application do + [ + extra_applications: [:logger] + ] + end + + # Run "mix help deps" to learn about dependencies. 
+ defp deps do + [ + {:explorer, "~> 0.9.0"} + # {:dep_from_hexpm, "~> 0.3.0"}, + # {:dep_from_git, git: "https://github.com/elixir-lang/my_dep.git", tag: "0.1.0"} + ] + end +end diff --git a/one-billion-row/mix.lock b/one-billion-row/mix.lock new file mode 100644 index 0000000..0a2e9bb --- /dev/null +++ b/one-billion-row/mix.lock @@ -0,0 +1,10 @@ +%{ + "aws_signature": {:hex, :aws_signature, "0.3.2", "adf33bc4af00b2089b7708bf20e3246f09c639a905a619b3689f0a0a22c3ef8f", [:rebar3], [], "hexpm", "b0daf61feb4250a8ab0adea60db3e336af732ff71dd3fb22e45ae3dcbd071e44"}, + "castore": {:hex, :castore, "1.0.8", "dedcf20ea746694647f883590b82d9e96014057aff1d44d03ec90f36a5c0dc6e", [:mix], [], "hexpm", "0b2b66d2ee742cb1d9cb8c8be3b43c3a70ee8651f37b75a8b982e036752983f1"}, + "emmap": {:hex, :emmap, "2.0.11", "aec85ae663998329cf4343ef8d3f6b49fc1b054362e6d2689ef3314f7f36d351", [:rebar3], [], "hexpm", "10789b911658c672c62a979f34eb8d1fc3f8237a367f7d5dc3dbb3e1ebefd1c4"}, + "explorer": {:hex, :explorer, "0.9.2", "a9598eeff8d36d88f643d14818bea1869ca70c4def61bfba22f040ee315b84b6", [:mix], [{:adbc, "~> 0.1", [hex: :adbc, repo: "hexpm", optional: true]}, {:aws_signature, "~> 0.3", [hex: :aws_signature, repo: "hexpm", optional: false]}, {:castore, "~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:flame, "~> 0.3", [hex: :flame, repo: "hexpm", optional: true]}, {:fss, "~> 0.1", [hex: :fss, repo: "hexpm", optional: false]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}, {:rustler, "~> 0.34.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.7", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}, {:table, "~> 0.1.2", [hex: :table, repo: "hexpm", optional: false]}, {:table_rex, "~> 3.1.1 or ~> 4.0.0", [hex: :table_rex, repo: "hexpm", optional: false]}], "hexpm", "63057e318d613c1819bd8bee2d8ed4f7061c3136edc6832ad18243d28e6344eb"}, + "fss": {:hex, :fss, "0.1.1", "9db2344dbbb5d555ce442ac7c2f82dd975b605b50d169314a20f08ed21e08642", [:mix], [], 
"hexpm", "78ad5955c7919c3764065b21144913df7515d52e228c09427a004afe9c1a16b0"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.8.0", "02d218b575d8175e80138557f46bee7af5598f29e9aff8935a6c369c0e6c47a5", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "00b1711d8d828200fe931e23bb0e72c2672a3a0ef76740e3c50433afda1965fb"}, + "table": {:hex, :table, "0.1.2", "87ad1125f5b70c5dea0307aa633194083eb5182ec537efc94e96af08937e14a8", [:mix], [], "hexpm", "7e99bc7efef806315c7e65640724bf165c3061cdc5d854060f74468367065029"}, + "table_rex": {:hex, :table_rex, "4.0.0", "3c613a68ebdc6d4d1e731bc973c233500974ec3993c99fcdabb210407b90959b", [:mix], [], "hexpm", "c35c4d5612ca49ebb0344ea10387da4d2afe278387d4019e4d8111e815df8f55"}, +} diff --git a/one-billion-row/test/obr_test.exs b/one-billion-row/test/obr_test.exs new file mode 100644 index 0000000..4f446c8 --- /dev/null +++ b/one-billion-row/test/obr_test.exs @@ -0,0 +1,16 @@ +defmodule ObrTest do + use ExUnit.Case + doctest Obr + + test "dataframe processing is correct" do + df = Explorer.DataFrame.new(column_1: ["c", "b", "c", "c"], column_2: [-6.0, 3.0, 1.2, 2.0]) + res = Obr.process_dataframe(df) + assert res == [{"b", 3.0, 3.0, 3.0}, {"c", -6.0, -2.8 / 3, 2.0}] + end + + test "formatting is correct" do + src = [{"b", 3.0, 2.91, 4.0}, {"c", -6.0, -2.8 / 3, 2.0}] + str = Obr.format_results(src) + assert str == "{b=3.0/3.0/4.0, c=-6.0/-0.9/2.0}" + end +end diff --git a/one-billion-row/test/test_helper.exs b/one-billion-row/test/test_helper.exs new file mode 100644 index 0000000..869559e --- /dev/null +++ b/one-billion-row/test/test_helper.exs @@ -0,0 +1 @@ +ExUnit.start()