From db71361481463fbe5109fc9b5a57fd1083018253 Mon Sep 17 00:00:00 2001 From: stuebinm Date: Thu, 11 May 2023 14:29:10 +0200 Subject: [PATCH] use haystack for search. For some reason it doesn't find Karlsruhe, but it does find basically everything else. --- gleam.toml | 1 + manifest.toml | 11 ++- src/bahnhofname.gleam | 209 +++++++++++++++++++++++++----------------- 3 files changed, 136 insertions(+), 85 deletions(-) diff --git a/gleam.toml b/gleam.toml index 63ee397..22c7545 100644 --- a/gleam.toml +++ b/gleam.toml @@ -14,6 +14,7 @@ gleam_stdlib = "~> 0.19" gleam_http = "~> 3.0" mist = "~> 0.4" gleam_hackney = "~> 1.0" +haystack = "~> 0.1" [dev-dependencies] gleeunit = "~> 0.6" diff --git a/manifest.toml b/manifest.toml index 9627978..e5b8c42 100644 --- a/manifest.toml +++ b/manifest.toml @@ -3,20 +3,24 @@ packages = [ { name = "certifi", version = "2.9.0", build_tools = ["rebar3"], requirements = [], otp_app = "certifi", source = "hex", outer_checksum = "266DA46BDB06D6C6D35FDE799BCB28D36D985D424AD7C08B5BB48F5B5CDD4641" }, + { name = "decimal", version = "2.1.1", build_tools = ["mix"], requirements = [], otp_app = "decimal", source = "hex", outer_checksum = "53CFE5F497ED0E7771AE1A475575603D77425099BA5FAEF9394932B35020FFCC" }, { name = "gleam_erlang", version = "0.18.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_erlang", source = "hex", outer_checksum = "C69F59D086AD50B80DE294FB0963550630971C9DC04E92B1F7AEEDD2C0BE226C" }, { name = "gleam_hackney", version = "1.0.0", build_tools = ["gleam"], requirements = ["gleam_http", "gleam_stdlib", "hackney"], otp_app = "gleam_hackney", source = "hex", outer_checksum = "B3C1E6BD138D57252F9F9E499C741E9227EE7EE9B017CA650EC8193E02F734E1" }, { name = "gleam_http", version = "3.2.0", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleam_http", source = "hex", outer_checksum = "D034F5CE0639CD142CBA210B7D5D14236C284B0C5772A043D2E22128594573AE" }, { name = "gleam_otp", version = "0.5.3", 
build_tools = ["gleam"], requirements = ["gleam_erlang", "gleam_stdlib"], otp_app = "gleam_otp", source = "hex", outer_checksum = "6E705B69464237353E0380AC8143BDB29A3F0BF6168755D5F2D6E55A34A8B077" }, { name = "gleam_stdlib", version = "0.28.1", build_tools = ["gleam"], requirements = [], otp_app = "gleam_stdlib", source = "hex", outer_checksum = "73F0A89FADE5022CBEF6D6C3551F9ADCE7054AFCE0CB1DC4C6D5AB4CA62D0111" }, { name = "gleeunit", version = "0.10.1", build_tools = ["gleam"], requirements = ["gleam_stdlib"], otp_app = "gleeunit", source = "hex", outer_checksum = "ECEA2DE4BE6528D36AFE74F42A21CDF99966EC36D7F25DEB34D47DD0F7977BAF" }, - { name = "glisten", version = "0.7.0", build_tools = ["gleam"], requirements = ["gleam_erlang", "gleam_stdlib", "gleam_otp"], otp_app = "glisten", source = "hex", outer_checksum = "52B530FF25370590843998D1B6C4EC6169DB1300D5E4407A5CDA1575374B7AEC" }, - { name = "hackney", version = "1.18.1", build_tools = ["rebar3"], requirements = ["certifi", "metrics", "mimerl", "ssl_verify_fun", "idna", "parse_trans", "unicode_util_compat"], otp_app = "hackney", source = "hex", outer_checksum = "A4ECDAFF44297E9B5894AE499E9A070EA1888C84AFDD1FD9B7B2BC384950128E" }, + { name = "glisten", version = "0.7.0", build_tools = ["gleam"], requirements = ["gleam_erlang", "gleam_otp", "gleam_stdlib"], otp_app = "glisten", source = "hex", outer_checksum = "52B530FF25370590843998D1B6C4EC6169DB1300D5E4407A5CDA1575374B7AEC" }, + { name = "hackney", version = "1.18.1", build_tools = ["rebar3"], requirements = ["idna", "metrics", "parse_trans", "ssl_verify_fun", "mimerl", "certifi", "unicode_util_compat"], otp_app = "hackney", source = "hex", outer_checksum = "A4ECDAFF44297E9B5894AE499E9A070EA1888C84AFDD1FD9B7B2BC384950128E" }, + { name = "haystack", version = "0.1.0", build_tools = ["mix"], requirements = ["jason", "stemmer"], otp_app = "haystack", source = "hex", outer_checksum = "27A582513EF933C1B11345B96F8D41EE137D03B25312BD85068FFE8FEC503635" }, { name = "idna", 
version = "6.1.1", build_tools = ["rebar3"], requirements = ["unicode_util_compat"], otp_app = "idna", source = "hex", outer_checksum = "92376EB7894412ED19AC475E4A86F7B413C1B9FBB5BD16DCCD57934157944CEA" }, + { name = "jason", version = "1.4.0", build_tools = ["mix"], requirements = ["decimal"], otp_app = "jason", source = "hex", outer_checksum = "79A3791085B2A0F743CA04CEC0F7BE26443738779D09302E01318F97BDB82121" }, { name = "metrics", version = "1.0.1", build_tools = ["rebar3"], requirements = [], otp_app = "metrics", source = "hex", outer_checksum = "69B09ADDDC4F74A40716AE54D140F93BEB0FB8978D8636EADED0C31B6F099F16" }, { name = "mimerl", version = "1.2.0", build_tools = ["rebar3"], requirements = [], otp_app = "mimerl", source = "hex", outer_checksum = "F278585650AA581986264638EBF698F8BB19DF297F66AD91B18910DFC6E19323" }, - { name = "mist", version = "0.10.0", build_tools = ["gleam"], requirements = ["gleam_erlang", "gleam_stdlib", "gleam_otp", "glisten", "gleam_http"], otp_app = "mist", source = "hex", outer_checksum = "5AFBABABF738BAB8720F047471051E4E9D102CA4694C120DB899FA12AD5D180B" }, + { name = "mist", version = "0.10.0", build_tools = ["gleam"], requirements = ["gleam_stdlib", "gleam_otp", "glisten", "gleam_http", "gleam_erlang"], otp_app = "mist", source = "hex", outer_checksum = "5AFBABABF738BAB8720F047471051E4E9D102CA4694C120DB899FA12AD5D180B" }, { name = "parse_trans", version = "3.3.1", build_tools = ["rebar3"], requirements = [], otp_app = "parse_trans", source = "hex", outer_checksum = "07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B" }, { name = "ssl_verify_fun", version = "1.1.6", build_tools = ["mix", "rebar3", "make"], requirements = [], otp_app = "ssl_verify_fun", source = "hex", outer_checksum = "BDB0D2471F453C88FF3908E7686F86F9BE327D065CC1EC16FA4540197EA04680" }, + { name = "stemmer", version = "1.1.0", build_tools = ["mix"], requirements = [], otp_app = "stemmer", source = "hex", outer_checksum = 
"0CB5FAF73476B84500E371FF39FD9A494F60AB31D991689C1CD53B920556228F" }, { name = "unicode_util_compat", version = "0.7.0", build_tools = ["rebar3"], requirements = [], otp_app = "unicode_util_compat", source = "hex", outer_checksum = "25EEE6D67DF61960CF6A794239566599B09E17E668D3700247BC498638152521" }, ] @@ -25,4 +29,5 @@ gleam_hackney = "~> 1.0" gleam_http = "~> 3.0" gleam_stdlib = "~> 0.19" gleeunit = "~> 0.6" +haystack = "~> 0.1" mist = "~> 0.4" diff --git a/src/bahnhofname.gleam b/src/bahnhofname.gleam index 36ea035..a783955 100644 --- a/src/bahnhofname.gleam +++ b/src/bahnhofname.gleam @@ -3,70 +3,62 @@ import gleam/http/request.{Request} import gleam/http.{Get} import gleam/bit_builder.{BitBuilder} import gleam/erlang/process +import gleam/erlang/atom import gleam/io import gleam/int import gleam/string import gleam/bit_string import gleam/list import gleam/map.{Map} -import gleam/result.{lazy_unwrap} import gleam/uri import gleam/hackney -import gleam/option.{None, Some} import mist -fn do_distlist( - b: String, - distlist: List(Int), - grapheme: String, - new_distlist: List(Int), - last_dist: Int, -) { - case #(b, distlist) { - #("", _) -> list.reverse(new_distlist) - #(_, [distlist_hd, distlist_snd, ..distlist_tl]) -> { - let assert Ok(b_hd) = string.first(b) - let b_tl = string.drop_left(b, up_to: 1) - let diff = case #(b_hd, grapheme) { - #(a, b) if a != b -> 1 - _ -> 0 - } - let minimum = - int.min(int.min(last_dist + 1, distlist_snd + 1), distlist_hd + diff) - do_distlist( - b_tl, - [distlist_snd, ..distlist_tl], - grapheme, - [minimum, ..new_distlist], - minimum, - ) - } - } -} +external type Index +external type Field -fn do_distance(a: String, b: String, distlist: List(Int), step: Int) { - case a { - "" -> result.unwrap(list.last(distlist), -1) - _ -> { - let assert Ok(src_hd) = string.first(a) - let src_tl = string.drop_left(a, up_to: 1) - let distlist = do_distlist(b, distlist, src_hd, [step], step) - do_distance(src_tl, b, distlist, step + 1) - } - 
} -} +external fn index_new(atom.Atom) -> Index = + "Elixir.Haystack.Index" "new" -fn levenshtein(a: String, b: String) -> Int { - case #(a, b) { - #(a, b) if a == b -> 0 - #("", b) -> string.length(b) - #(a, "") -> string.length(a) - #(a, b) -> { - let distlist = list.range(0, string.length(b)) - do_distance(a, b, distlist, 1) - } - } -} +external fn index_ref(Index, Field) -> Index = + "Elixir.Haystack.Index" "ref" + +external fn index_field(Index, Field) -> Index = + "Elixir.Haystack.Index" "field" + +external fn field_term(String) -> Field = + "Elixir.Haystack.Index.Field" "term" + +external fn field_new(String) -> Field = + "Elixir.Haystack.Index.Field" "new" + +external fn index_add(Index, List(a)) -> Index = + "Elixir.Haystack.Index" "add" + +external fn index_search(Index, String) -> List(Map(atom.Atom, String)) = + "Elixir.Haystack.Index" "search" + +pub external fn inspect(a) -> a = + "Elixir.IO" "inspect" + +external type Query +external type Clause +external type Expression +external fn query_new() -> Query = + "Elixir.Haystack.Query" "new" +external fn query_clause(Query, Clause) -> Query = + "Elixir.Haystack.Query" "clause" +external fn query_run(Query, Index) -> List(Map(atom.Atom, String)) = + "Elixir.Haystack.Query" "run" +external fn clause_new(atom.Atom) -> Clause = + "Elixir.Haystack.Query.Clause" "new" +external fn query_expressions(Clause, List(Expression)) -> Clause = + "Elixir.Haystack.Query.Clause" "expressions" +external fn query_expression_new(atom.Atom, List(#(atom.Atom, String))) -> Expression = + "Elixir.Haystack.Query.Expression" "new" + +external fn tokenize(String) -> List(Map(atom.Atom, String)) = + "Elixir.Haystack.Tokenizer" "tokenize" fn unpercent(encoded: String) -> String { let #([head], chunks) = @@ -94,34 +86,38 @@ fn unpercent(encoded: String) -> String { res } -fn guess_station(query: String, stations: Map(String, String)) -> String { - query - stations - |> map.keys - |> list.map(fn(a) { #(levenshtein(query, a), a) }) - 
|> list.fold( - from: #(string.length(query), query), - with: fn(a, b) { - case a.0 < b.0 { - True -> a - False -> b - } - }, - ) - |> fn(a: #(Int, String)) { a.1 } -} - fn the_lookup( query: String, stations: Map(String, String), ds100s: Map(String, String), -) -> String { - map.get(ds100s, query) - |> lazy_unwrap(fn() { - io.println(query) - map.get(stations, query) - |> lazy_unwrap(fn() { guess_station(query, stations) }) - }) + fuzzy: fn(String) -> List(String) +) -> #(Int, String) { + case map.get(ds100s, query) { + Ok(name) -> #(200, name) + _ -> { + io.println(query) + case map.get(stations, query) { + Ok(ds100) -> #(200, ds100) + _ -> { + let results = fuzzy(query) + |> list.filter_map(fn (res) { map.get(ds100s, string.uppercase(res)) }) + case results { + // results -> { + // let names = results + // |> list.map (fn (res) { + // map.get(ds100s, string.uppercase(res)) + // |> result.map(fn(a) { "/" <> a }) + // |> result.unwrap("/")}) + // #(200, string.join(names, "\n")) + // } + [res] -> #(302, res) + [res, ..] -> #(302, res) + _ -> #(404, "??") + } + } + } + } + } } fn lookup_station( @@ -129,6 +125,7 @@ fn lookup_station( stations: Map(String, String), ds100s: Map(String, String), baseurl: String, + fuzzy: fn (String) -> List(String) ) -> Response(BitBuilder) { let #(code, text) = case request { Request(method: Get, path: "/help", ..) @@ -136,10 +133,8 @@ fn lookup_station( 200, "ds100 → Name: " <> baseurl <> "/NN\n" <> "Name → ds100: " <> baseurl <> "/Nürnberg Hbf", ) - Request(method: Get, path: "/" <> path, ..) -> #( - 200, - the_lookup(unpercent(path), stations, ds100s), - ) + Request(method: Get, path: "/" <> path, ..) -> + the_lookup(unpercent(path), stations, ds100s, fuzzy) _ -> #(404, "intended usage is e.g. 
curl " <> baseurl <> "/FF") } let body = bit_builder.from_string(text) @@ -154,6 +149,10 @@ fn lookup_station( "https://stuebinm.eu/git/bahnhof.name", ) |> response.prepend_header("content-type", "text/plain; charset=utf8") + |> fn (a) { case code == 302 { + True -> response.prepend_header(a, "location", text) + _ -> a + } } |> response.set_body(body) } @@ -193,10 +192,56 @@ pub fn main() { stations |> list.map(fn(a) { #(a.1, a.0) }) |> map.from_list + let ref = atom.create_from_string("ref") + let index = index_new(atom.create_from_string("stations")) + |> index_ref(field_term("id")) + |> index_field(field_new("name")) + |> index_add(stations + |> list.map(fn(tuple) {case tuple { + #(name, ds100) + -> map.from_list([#("id", ds100), #("name", name)] + )}})) - mist.run_service( + + let fuzzy = fn(searchterm: String) -> List(String) { + let query = query_new() + let match = atom.create_from_string("match") + let field = atom.create_from_string("field") + let term = atom.create_from_string("term") + let expressions = tokenize(inspect(searchterm)) + |> list.filter_map(fn (a) { map.get(a, atom.create_from_string("v")) }) + |> list.map(fn (token) { query_expression_new(match, [#(field, "name"), #(term, token)]) }) + let clause = query_expressions(clause_new(atom.create_from_string("all")), expressions) + let query = query_clause(query, clause) + + let matches = query_run(query, index) + |> list.filter_map(fn (a) { map.get(a, ref) }) + + inspect(matches) + case list.length(matches) > 5 { + True -> { + let query = query_new() + let clause = query_expressions( + clause_new(atom.create_from_string("all")), + [query_expression_new(match, [#(field, "name"), #(term, "hbf")]) , ..expressions] + ) + let query = query_clause(query, clause) + let narrow = query_run(query, index) + |> list.filter_map(fn (a) { map.get(a, ref) }) + case narrow { + [] -> matches + _ -> narrow + } + } + _ -> matches + } + } + + io.println("compiled index, starting server …") + + let _ = 
mist.run_service( 2345, - fn(req) { lookup_station(req, stationmap, ds100map, baseurl) }, + fn(req) { lookup_station(req, stationmap, ds100map, baseurl, fuzzy) }, max_body_limit: 100, ) process.sleep_forever()