separate domains, fuzzy matching all-round

This commit is contained in:
stuebinm 2023-07-09 02:19:08 +02:00
parent 128198f070
commit 6a61154bda

View file

@ -1,5 +1,5 @@
import gleam/http/response.{Response} import gleam/http/response.{Response}
import gleam/http/request.{Request} import gleam/http/request.{Request,get_header}
import gleam/http.{Get} import gleam/http.{Get}
import gleam/bit_builder.{BitBuilder} import gleam/bit_builder.{BitBuilder}
import gleam/erlang/process import gleam/erlang/process
@ -13,8 +13,15 @@ import gleam/list
import gleam/map.{Map} import gleam/map.{Map}
import gleam/uri import gleam/uri
import gleam/hackney import gleam/hackney
import gleam/pair.{swap}
import mist import mist
// Host names that the request's x-forwarded-host header is compared against
// to pick a lookup mode. The ds100 and ril100 hosts both select the DS100
// lookup.
const ds100_domain = "ds100.bahnhof.name"
const ril100_domain = "ril100.bahnhof.name"
// Host that selects the Leitpunkt lookup.
const leitpunkt_domain = "leitpunkt.bahnhof.name"
// Bare domain, used in the help text and in the usage hint of the 404 reply.
const domain = "bahnhof.name"
// URL scheme put in front of a domain whenever a link is built.
const proto = "https://"
external type Index external type Index
external type Field external type Field
@ -36,9 +43,6 @@ external fn field_new(String) -> Field =
external fn index_add(Index, List(a)) -> Index = external fn index_add(Index, List(a)) -> Index =
"Elixir.Haystack.Index" "add" "Elixir.Haystack.Index" "add"
external fn index_search(Index, String) -> List(Map(atom.Atom, String)) =
"Elixir.Haystack.Index" "search"
pub external fn inspect(a) -> a = pub external fn inspect(a) -> a =
"Elixir.IO" "inspect" "Elixir.IO" "inspect"
@ -61,6 +65,17 @@ external fn query_expression_new(atom.Atom, List(#(atom.Atom, String))) -> Expre
external fn tokenize(String) -> List(Map(atom.Atom, String)) = external fn tokenize(String) -> List(Map(atom.Atom, String)) =
"Elixir.Haystack.Tokenizer" "tokenize" "Elixir.Haystack.Tokenizer" "tokenize"
/// The kind of station identifier a lookup is about. It selects which search
/// index and which pair of lookup maps are consulted.
type IdKind {
DS100
Leitpunkt
}
/// Outcome of resolving a search term to a value of type `t`.
type Matched(t) {
// the search term was found directly in the lookup map
Exact(t)
// no direct hit; the value comes from the first usable fuzzy search result
Fuzzy(t)
// neither the direct lookup nor the fuzzy search produced a value
Failed
}
fn unpercent(encoded: String) -> String { fn unpercent(encoded: String) -> String {
let #([head], chunks) = let #([head], chunks) =
encoded encoded
@ -96,43 +111,30 @@ fn lookup_exact(query: String, lookup: Map(String, String)) -> #(Int, String) {
} }
/// Looks up a station by its name, with fuzzy matching. /// Looks up a station by its name, with fuzzy matching.
fn lookup_by_name( fn lookup_fuzzy(
query: String, query: String,
stations: Map(String, String), kind: IdKind,
ds100s: Map(String, String), fuzzy: fn(String, IdKind) -> Matched(String)
fuzzy: fn(String) -> List(String)
) -> #(Int, String) { ) -> #(Int, String) {
io.println(query) case fuzzy(query, kind) {
Exact(res) -> #(200, res)
case map.get(stations, query) { Fuzzy(res) -> #(302, res)
Ok(ds100) -> #(200, ds100) Failed -> #(404, "??")
_ -> {
let results = fuzzy(query)
|> list.filter_map(fn (res) { map.get(ds100s, string.uppercase(res)) })
case results {
// results -> {
// let names = results
// |> list.map (fn (res) {
// map.get(ds100s, string.uppercase(res))
// |> result.map(fn(a) { "/" <> a })
// |> result.unwrap("/")})
// #(200, string.join(names, "\n"))
// }
[res] -> #(302, res)
[res, ..] -> #(302, res)
_ -> #(404, "??")
}
}
} }
} }
/// Keeps `res` when its status code is 200; otherwise evaluates `fallback`
/// and uses that result instead. The chosen tuple is passed through
/// `inspect` before being returned.
fn if_not(res: #(Int, t), fallback: fn() -> #(Int, t)) -> #(Int, t) {
  let chosen = case res.0 {
    200 -> res
    // only run the fallback when the first attempt was not a 200
    _ -> fallback()
  }
  inspect(chosen)
}
fn lookup_station( fn lookup_station(
request: Request(t), request: Request(t),
stations: Map(String, String), ds100_to_name: Map(String, String),
ds100s: Map(String, String), leitpunkt_to_name: Map(String, String),
leitpunkte: Map(String, String), fuzzy: fn (String, IdKind) -> Matched(String)
baseurl: String,
fuzzy: fn (String) -> List(String)
) -> Response(BitBuilder) { ) -> Response(BitBuilder) {
let #(code, text) = case request { let #(code, text) = case request {
// blackhole favicon.ico requests instead of using the index // blackhole favicon.ico requests instead of using the index
@ -140,35 +142,33 @@ fn lookup_station(
Request(method: Get, path: "/help", ..) Request(method: Get, path: "/help", ..)
| Request(method: Get, path: "/", ..) -> #( | Request(method: Get, path: "/", ..) -> #(
200, 200,
"ds100 → Name: " <> baseurl <> "/NN\n" <> "Name → ds100: " <> baseurl <> "/Nürnberg Hbf", "ril100 → Name: " <> proto<>ril100_domain<>"/HG\n" <>
"Name → ril100: " <> proto<>ril100_domain <> "/Göttingen\n\n" <>
"Leitpunkt → Name: " <> proto<>leitpunkt_domain<>"/GOE\n" <>
"Name → Leitpunkt: " <> proto<>leitpunkt_domain <> "/Göttingen\n\n"<>
"Fuzzy:" <> proto<>domain<>"/...",
) )
Request(method: Get, path: "/ds100/" <> path, ..) ->
path
|> unpercent
|> string.uppercase
|> lookup_exact(ds100s)
Request(method: Get, path: "/name/" <> path, ..) ->
path
|> unpercent
|> lookup_by_name(stations, ds100s, fuzzy)
Request(method: Get, path: "/leitpunkt/" <> path, ..) ->
path
|> unpercent
|> string.uppercase
|> lookup_exact(leitpunkte)
Request(method: Get, path: "/" <> path, ..) -> { Request(method: Get, path: "/" <> path, ..) -> {
let path = unpercent(path) let query = unpercent(path)
case get_header(request, "x-forwarded-host") {
let by_ds100 = lookup_exact(path, ds100s) Ok(domain) if domain == leitpunkt_domain -> query
let by_lp = lookup_exact(path, leitpunkte) |> lookup_exact(leitpunkt_to_name)
|> if_not(fn() {lookup_fuzzy(query,Leitpunkt,fuzzy)})
case #(by_ds100.0, by_lp.0) { Ok(domain) if domain == ril100_domain || domain == ds100_domain -> query
#(200, _) -> by_ds100 |> lookup_exact(ds100_to_name)
#(_, 200) -> by_lp |> if_not(fn() {lookup_fuzzy(query,DS100, fuzzy)})
_ -> lookup_by_name(path, stations, ds100s, fuzzy) _ -> {
let by_ds100 = lookup_exact(query, ds100_to_name)
let by_lp = lookup_exact(query, leitpunkt_to_name)
case #(by_ds100.0, by_lp.0) {
#(200, _) -> #(302, proto<>ril100_domain<>"/"<>path)
#(_, 200) -> #(302, proto<>leitpunkt_domain<>"/"<>path)
_ -> #(302, proto<>ril100_domain<>"/"<>path)
}
}
} }
} }
_ -> #(404, "intended usage is e.g. curl " <> baseurl <> "/FF") _ -> #(404, "intended usage is e.g. curl " <> proto<>domain<>"/FF")
} }
let body = bit_builder.from_string(text) let body = bit_builder.from_string(text)
@ -189,67 +189,54 @@ fn lookup_station(
|> response.set_body(body) |> response.set_body(body)
} }
/// Downloads the DB Netz Betriebsstellenverzeichnis CSV and returns its body.
///
/// A failed HTTP request is returned as `Error`, matching the declared return
/// type, so the caller decides how to react. The hard-coded URL is still
/// asserted on: failing to parse it would be a programming error, not a
/// runtime condition.
fn fetch_data() -> Result(String, hackney.Error) {
  let assert Ok(uri) =
    uri.parse(
      "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
    )
  let assert Ok(request) = request.from_uri(uri)
  case hackney.send(request) {
    // some ü are corrupted for some reason
    // NOTE(review): the pattern is kept exactly as found; presumably it is
    // meant to be the U+FFFD replacement character (bytes EF BF BD) — confirm.
    Ok(response) -> Ok(string.replace(response.body, "<EFBFBD>", "ü"))
    Error(err) -> Error(err)
  }
}
/// Splits semicolon-separated text into rows of fields, skipping the first
/// line (the CSV header).
fn read_csv(contents) -> List(List(String)) {
  // the file doesn't use quotes, so plain splitting is fine
  let lines = string.split(contents, on: "\n")
  // drop CSV header
  let rows = list.drop(lines, 1)
  list.map(rows, fn(row) { string.split(row, on: ";") })
}
pub fn main() { pub fn main() {
let assert Ok(bahn_ril100) = fetch_data() let assert Ok(bahn_ril100) = fetch_data()
let baseurl = "https://bahnhof.name"
let stations = read_csv(bahn_ril100) let ds100s = read_csv(bahn_ril100)
|> list.filter_map(fn(fields) { |> list.filter_map(fn(fields) {
case fields { case fields {
[_, ds100, name, ..] -> Ok(#(name, ds100)) [_, ds100, name, ..] -> Ok(#(name, ds100))
_ -> Error(fields) _ -> Error(fields)
} }
}) })
let stationmap =
stations
|> map.from_list
let ds100map =
stations
|> list.map(fn(a) { #(a.1, a.0) })
|> map.from_list
let ref = atom.create_from_string("ref")
let index = index_new(atom.create_from_string("stations"))
|> index_ref(field_term("id"))
|> index_field(field_new("name"))
|> index_add(stations
|> list.map(fn(tuple) {case tuple {
#(name, ds100)
-> map.from_list([#("id", ds100), #("name", name)]
)}}))
let assert Ok(leitpunkte_raw) = file.read("data/leitpunkte.csv") let assert Ok(leitpunkte_raw) = file.read("data/leitpunkte.csv")
let leitpunkte = let leitpunkte =
read_csv(leitpunkte_raw) read_csv(leitpunkte_raw)
|> list.filter_map(fn(fields) { |> list.filter_map(fn(fields) {
case fields { case fields {
[lp, _, ds100] -> Ok(#(lp, ds100)) [lp, name, _ds100] -> Ok(#(name, lp))
_ -> Error(fields) _ -> Error(fields)
} }
}) })
|> map.from_list
let fuzzy = fn(searchterm: String) -> List(String) { let name_to_ds100 = map.from_list(ds100s)
let name_to_leitpunkt = map.from_list(leitpunkte)
let ds100_to_name = map.from_list(list.map(ds100s, swap))
let leitpunkt_to_name = map.from_list(list.map(leitpunkte, swap))
let ds100index = index_new(atom.create_from_string("ds100"))
|> index_ref(field_term("id"))
|> index_field(field_new("name"))
|> index_add(ds100s
|> list.map(fn(tuple) {case tuple {
#(name, ds100)
-> map.from_list([#("id", ds100), #("name", name)]
)}}))
let leitpunkt_index = index_new(atom.create_from_string("leitpunkt"))
|> index_ref(field_term("id"))
|> index_field(field_new("name"))
|> index_add(leitpunkte
|> list.map(fn(tuple) {case tuple {
#(name, leitpunkt)
-> map.from_list([#("id", leitpunkt), #("name", name)]
)}}))
let ref = atom.create_from_string("ref")
let fuzzy = fn(searchterm: String, kind: IdKind) -> List(String) {
let query = query_new() let query = query_new()
let index = case kind {
DS100 -> ds100index
Leitpunkt -> leitpunkt_index
}
let match = atom.create_from_string("match") let match = atom.create_from_string("match")
let field = atom.create_from_string("field") let field = atom.create_from_string("field")
let term = atom.create_from_string("term") let term = atom.create_from_string("term")
@ -282,12 +269,59 @@ pub fn main() {
} }
} }
io.println("compiled index, starting server …") let exact_then_fuzzy = fn(searchterm: String, kind: IdKind) -> Matched(String) {
let #(stations, ids) = case kind {
DS100 -> #(name_to_ds100, ds100_to_name)
Leitpunkt -> #(name_to_leitpunkt, leitpunkt_to_name)
}
case map.get(stations, searchterm) {
Ok(id) -> Exact(id)
_ -> {
let results = fuzzy(searchterm, kind)
|> list.filter_map(fn (res) {
map.get(ids, string.uppercase(res))
})
case results {
[res] -> Fuzzy(res)
[res, ..] -> Fuzzy(res)
_ -> Failed
}
}
}
}
io.println("compiled indices, starting server …")
let _ = mist.run_service( let _ = mist.run_service(
2345, 2345,
fn(req) { lookup_station(req, stationmap, ds100map, leitpunkte, baseurl, fuzzy) }, fn(req) { lookup_station(
req,
ds100_to_name,
leitpunkt_to_name,
exact_then_fuzzy
) },
max_body_limit: 100, max_body_limit: 100,
) )
process.sleep_forever() process.sleep_forever()
} }
/// Downloads the DB Netz Betriebsstellenverzeichnis CSV and returns its body.
///
/// A failed HTTP request is returned as `Error`, matching the declared return
/// type, so the caller decides how to react. The hard-coded URL is still
/// asserted on: failing to parse it would be a programming error, not a
/// runtime condition.
fn fetch_data() -> Result(String, hackney.Error) {
  let assert Ok(uri) =
    uri.parse(
      "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
    )
  let assert Ok(request) = request.from_uri(uri)
  case hackney.send(request) {
    // some ü are corrupted for some reason
    // NOTE(review): the pattern is kept exactly as found; presumably it is
    // meant to be the U+FFFD replacement character (bytes EF BF BD) — confirm.
    Ok(response) -> Ok(string.replace(response.body, "<EFBFBD>", "ü"))
    Error(err) -> Error(err)
  }
}
/// Splits semicolon-separated text into rows of fields, skipping the first
/// line (the CSV header).
fn read_csv(contents) -> List(List(String)) {
  // the file doesn't use quotes, so plain splitting is fine
  let lines = string.split(contents, on: "\n")
  // drop CSV header
  let rows = list.drop(lines, 1)
  list.map(rows, fn(row) { string.split(row, on: ";") })
}