From 6a61154bdad9315cc8fede127f74e8dd240dd4db Mon Sep 17 00:00:00 2001 From: stuebinm Date: Sun, 9 Jul 2023 02:19:08 +0200 Subject: [PATCH] separate domains, fuzzy matching all-round --- src/bahnhofname.gleam | 242 ++++++++++++++++++++++++------------------ 1 file changed, 138 insertions(+), 104 deletions(-) diff --git a/src/bahnhofname.gleam b/src/bahnhofname.gleam index 3b9bbc3..27167a9 100644 --- a/src/bahnhofname.gleam +++ b/src/bahnhofname.gleam @@ -1,5 +1,5 @@ import gleam/http/response.{Response} -import gleam/http/request.{Request} +import gleam/http/request.{Request,get_header} import gleam/http.{Get} import gleam/bit_builder.{BitBuilder} import gleam/erlang/process @@ -13,8 +13,15 @@ import gleam/list import gleam/map.{Map} import gleam/uri import gleam/hackney +import gleam/pair.{swap} import mist +const ds100_domain = "ds100.bahnhof.name" +const ril100_domain = "ril100.bahnhof.name" +const leitpunkt_domain = "leitpunkt.bahnhof.name" +const domain = "bahnhof.name" +const proto = "https://" + external type Index external type Field @@ -36,9 +43,6 @@ external fn field_new(String) -> Field = external fn index_add(Index, List(a)) -> Index = "Elixir.Haystack.Index" "add" -external fn index_search(Index, String) -> List(Map(atom.Atom, String)) = - "Elixir.Haystack.Index" "search" - pub external fn inspect(a) -> a = "Elixir.IO" "inspect" @@ -61,6 +65,17 @@ external fn query_expression_new(atom.Atom, List(#(atom.Atom, String))) -> Expre external fn tokenize(String) -> List(Map(atom.Atom, String)) = "Elixir.Haystack.Tokenizer" "tokenize" +type IdKind { + DS100 + Leitpunkt +} + +type Matched(t) { + Exact(t) + Fuzzy(t) + Failed +} + fn unpercent(encoded: String) -> String { let #([head], chunks) = encoded @@ -96,43 +111,30 @@ fn lookup_exact(query: String, lookup: Map(String, String)) -> #(Int, String) { } /// Looks up a station by its name, with fuzzy matching. -fn lookup_by_name( +fn lookup_fuzzy( query: String, - stations: Map(String, String), - ds100s: Map(String, String), - fuzzy: fn(String) -> List(String) + kind: IdKind, + fuzzy: fn(String, IdKind) -> Matched(String) ) -> #(Int, String) { - io.println(query) - - case map.get(stations, query) { - Ok(ds100) -> #(200, ds100) - _ -> { - let results = fuzzy(query) - |> list.filter_map(fn (res) { map.get(ds100s, string.uppercase(res)) }) - case results { - // results -> { - // let names = results - // |> list.map (fn (res) { - // map.get(ds100s, string.uppercase(res)) - // |> result.map(fn(a) { "/" <> a }) - // |> result.unwrap("/")}) - // #(200, string.join(names, "\n")) - // } - [res] -> #(302, res) - [res, ..] -> #(302, res) - _ -> #(404, "??") - } - } + case fuzzy(query, kind) { + Exact(res) -> #(200, res) + Fuzzy(res) -> #(302, res) + Failed -> #(404, "??") } } +fn if_not(res: #(Int,t), fallback: fn() -> #(Int,t)) -> #(Int,t) { + inspect(case res { + #(200, _) -> res + _ -> fallback() + }) +} + fn lookup_station( request: Request(t), - stations: Map(String, String), - ds100s: Map(String, String), - leitpunkte: Map(String, String), - baseurl: String, - fuzzy: fn (String) -> List(String) + ds100_to_name: Map(String, String), + leitpunkt_to_name: Map(String, String), + fuzzy: fn (String, IdKind) -> Matched(String) ) -> Response(BitBuilder) { let #(code, text) = case request { // blackhole favicon.ico requests instead of using the index @@ -140,35 +142,33 @@ fn lookup_station( Request(method: Get, path: "/help", ..) | Request(method: Get, path: "/", ..) -> #( 200, - "ds100 → Name: " <> baseurl <> "/NN\n" <> "Name → ds100: " <> baseurl <> "/Nürnberg Hbf", + "ril100 → Name: " <> proto<>ril100_domain<>"/HG\n" <> + "Name → ril100: " <> proto<>ril100_domain <> "/Göttingen\n\n" <> + "Leitpunkt → Name: " <> proto<>leitpunkt_domain<>"/GOE\n" <> + "Name → Leitpunkt: " <> proto<>leitpunkt_domain <> "/Göttingen\n\n"<> + "Fuzzy:" <> proto<>domain<>"/...", ) - Request(method: Get, path: "/ds100/" <> path, ..) -> - path - |> unpercent - |> string.uppercase - |> lookup_exact(ds100s) - Request(method: Get, path: "/name/" <> path, ..) -> - path - |> unpercent - |> lookup_by_name(stations, ds100s, fuzzy) - Request(method: Get, path: "/leitpunkt/" <> path, ..) -> - path - |> unpercent - |> string.uppercase - |> lookup_exact(leitpunkte) Request(method: Get, path: "/" <> path, ..) -> { - let path = unpercent(path) - - let by_ds100 = lookup_exact(path, ds100s) - let by_lp = lookup_exact(path, leitpunkte) - - case #(by_ds100.0, by_lp.0) { - #(200, _) -> by_ds100 - #(_, 200) -> by_lp - _ -> lookup_by_name(path, stations, ds100s, fuzzy) + let query = unpercent(path) + case get_header(request, "x-forwarded-host") { + Ok(domain) if domain == leitpunkt_domain -> query + |> lookup_exact(leitpunkt_to_name) + |> if_not(fn() {lookup_fuzzy(query,Leitpunkt,fuzzy)}) + Ok(domain) if domain == ril100_domain || domain == ds100_domain -> query + |> lookup_exact(ds100_to_name) + |> if_not(fn() {lookup_fuzzy(query,DS100, fuzzy)}) + _ -> { + let by_ds100 = lookup_exact(query, ds100_to_name) + let by_lp = lookup_exact(query, leitpunkt_to_name) + case #(by_ds100.0, by_lp.0) { + #(200, _) -> #(302, proto<>ril100_domain<>"/"<>path) + #(_, 200) -> #(302, proto<>leitpunkt_domain<>"/"<>path) + _ -> #(302, proto<>ril100_domain<>"/"<>path) + } + } } } - _ -> #(404, "intended usage is e.g. curl " <> baseurl <> "/FF") + _ -> #(404, "intended usage is e.g. curl " <> proto<>domain<>"/FF") } let body = bit_builder.from_string(text) @@ -189,67 +189,54 @@ fn lookup_station( |> response.set_body(body) } -fn fetch_data() -> Result(String, hackney.Error) { - let assert Ok(uri) = - uri.parse( - "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv", - ) - let assert Ok(request) = request.from_uri(uri) - let assert Ok(response) = hackney.send(request) - - // some ü are corrupted for some reason - Ok(string.replace(response.body, "�", "ü")) -} - -fn read_csv(contents) -> List(List(String)) { - contents - // the file doesn't use quotes, so this is fine - |> string.split(on: "\n") - // drop CSV header - |> list.drop(1) - |> list.map(fn(a) { string.split(a, on: ";") }) -} - pub fn main() { let assert Ok(bahn_ril100) = fetch_data() - let baseurl = "https://bahnhof.name" - let stations = read_csv(bahn_ril100) + + let ds100s = read_csv(bahn_ril100) |> list.filter_map(fn(fields) { case fields { [_, ds100, name, ..] -> Ok(#(name, ds100)) _ -> Error(fields) } }) - let stationmap = - stations - |> map.from_list - let ds100map = - stations - |> list.map(fn(a) { #(a.1, a.0) }) - |> map.from_list - let ref = atom.create_from_string("ref") - let index = index_new(atom.create_from_string("stations")) - |> index_ref(field_term("id")) - |> index_field(field_new("name")) - |> index_add(stations - |> list.map(fn(tuple) {case tuple { - #(name, ds100) - -> map.from_list([#("id", ds100), #("name", name)] - )}})) - let assert Ok(leitpunkte_raw) = file.read("data/leitpunkte.csv") let leitpunkte = read_csv(leitpunkte_raw) |> list.filter_map(fn(fields) { case fields { - [lp, _, ds100] -> Ok(#(lp, ds100)) + [lp, name, _ds100] -> Ok(#(name, lp)) _ -> Error(fields) } }) - |> map.from_list - let fuzzy = fn(searchterm: String) -> List(String) { + let name_to_ds100 = map.from_list(ds100s) + let name_to_leitpunkt = map.from_list(leitpunkte) + let ds100_to_name = map.from_list(list.map(ds100s, swap)) + let leitpunkt_to_name = map.from_list(list.map(leitpunkte, swap)) + let ds100index = index_new(atom.create_from_string("ds100")) + |> index_ref(field_term("id")) + |> index_field(field_new("name")) + |> index_add(ds100s + |> list.map(fn(tuple) {case tuple { + #(name, ds100) + -> map.from_list([#("id", ds100), #("name", name)] + )}})) + let leitpunkt_index = index_new(atom.create_from_string("leitpunkt")) + |> index_ref(field_term("id")) + |> index_field(field_new("name")) + |> index_add(leitpunkte + |> list.map(fn(tuple) {case tuple { + #(name, leitpunkt) + -> map.from_list([#("id", leitpunkt), #("name", name)] + )}})) + + let ref = atom.create_from_string("ref") + let fuzzy = fn(searchterm: String, kind: IdKind) -> List(String) { let query = query_new() + let index = case kind { + DS100 -> ds100index + Leitpunkt -> leitpunkt_index + } let match = atom.create_from_string("match") let field = atom.create_from_string("field") let term = atom.create_from_string("term") @@ -282,12 +269,59 @@ pub fn main() { } } - io.println("compiled index, starting server …") + let exact_then_fuzzy = fn(searchterm: String, kind: IdKind) -> Matched(String) { + let #(stations, ids) = case kind { + DS100 -> #(name_to_ds100, ds100_to_name) + Leitpunkt -> #(name_to_leitpunkt, leitpunkt_to_name) + } + case map.get(stations, searchterm) { + Ok(id) -> Exact(id) + _ -> { + let results = fuzzy(searchterm, kind) + |> list.filter_map(fn (res) { + map.get(ids, string.uppercase(res)) + }) + case results { + [res] -> Fuzzy(res) + [res, ..] -> Fuzzy(res) + _ -> Failed + } + } + } + } + + io.println("compiled indices, starting server …") let _ = mist.run_service( 2345, - fn(req) { lookup_station(req, stationmap, ds100map, leitpunkte, baseurl, fuzzy) }, + fn(req) { lookup_station( + req, + ds100_to_name, + leitpunkt_to_name, + exact_then_fuzzy + ) }, max_body_limit: 100, ) process.sleep_forever() } + +fn fetch_data() -> Result(String, hackney.Error) { + let assert Ok(uri) = + uri.parse( + "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv", + ) + let assert Ok(request) = request.from_uri(uri) + let assert Ok(response) = hackney.send(request) + + // some ü are corrupted for some reason + Ok(string.replace(response.body, "�", "ü")) +} + +fn read_csv(contents) -> List(List(String)) { + contents + // the file doesn't use quotes, so this is fine + |> string.split(on: "\n") + // drop CSV header + |> list.drop(1) + |> list.map(fn(a) { string.split(a, on: ";") }) +}