separate domains, fuzzy matching all-round
This commit is contained in:
parent
128198f070
commit
6a61154bda
1 changed file with 138 additions and 104 deletions
|
@ -1,5 +1,5 @@
|
|||
import gleam/http/response.{Response}
|
||||
import gleam/http/request.{Request}
|
||||
import gleam/http/request.{Request,get_header}
|
||||
import gleam/http.{Get}
|
||||
import gleam/bit_builder.{BitBuilder}
|
||||
import gleam/erlang/process
|
||||
|
@ -13,8 +13,15 @@ import gleam/list
|
|||
import gleam/map.{Map}
|
||||
import gleam/uri
|
||||
import gleam/hackney
|
||||
import gleam/pair.{swap}
|
||||
import mist
|
||||
|
||||
const ds100_domain = "ds100.bahnhof.name"
|
||||
const ril100_domain = "ril100.bahnhof.name"
|
||||
const leitpunkt_domain = "leitpunkt.bahnhof.name"
|
||||
const domain = "bahnhof.name"
|
||||
const proto = "https://"
|
||||
|
||||
external type Index
|
||||
external type Field
|
||||
|
||||
|
@ -36,9 +43,6 @@ external fn field_new(String) -> Field =
|
|||
external fn index_add(Index, List(a)) -> Index =
|
||||
"Elixir.Haystack.Index" "add"
|
||||
|
||||
external fn index_search(Index, String) -> List(Map(atom.Atom, String)) =
|
||||
"Elixir.Haystack.Index" "search"
|
||||
|
||||
pub external fn inspect(a) -> a =
|
||||
"Elixir.IO" "inspect"
|
||||
|
||||
|
@ -61,6 +65,17 @@ external fn query_expression_new(atom.Atom, List(#(atom.Atom, String))) -> Expre
|
|||
external fn tokenize(String) -> List(Map(atom.Atom, String)) =
|
||||
"Elixir.Haystack.Tokenizer" "tokenize"
|
||||
|
||||
/// Which identifier scheme a lookup should be answered in.
///
/// The value picks the search index and the lookup maps used for a query.
type IdKind {
  /// DS100 / Ril100 codes (requests for the ds100 and ril100 domains).
  DS100
  /// Leitpunkt codes (requests for the leitpunkt domain).
  Leitpunkt
}
|
||||
|
||||
/// Outcome of resolving a search term.
///
/// `lookup_fuzzy` turns these into HTTP status codes: `Exact` becomes 200,
/// `Fuzzy` becomes 302 and `Failed` becomes 404.
type Matched(t) {
  /// The search term was found verbatim.
  Exact(t)
  /// Nothing matched verbatim; this is the first hit of the fuzzy search.
  Fuzzy(t)
  /// Neither the verbatim lookup nor the fuzzy search produced a result.
  Failed
}
|
||||
|
||||
fn unpercent(encoded: String) -> String {
|
||||
let #([head], chunks) =
|
||||
encoded
|
||||
|
@ -96,43 +111,30 @@ fn lookup_exact(query: String, lookup: Map(String, String)) -> #(Int, String) {
|
|||
}
|
||||
|
||||
/// Looks up a station by its name, with fuzzy matching.
|
||||
fn lookup_by_name(
|
||||
fn lookup_fuzzy(
|
||||
query: String,
|
||||
stations: Map(String, String),
|
||||
ds100s: Map(String, String),
|
||||
fuzzy: fn(String) -> List(String)
|
||||
kind: IdKind,
|
||||
fuzzy: fn(String, IdKind) -> Matched(String)
|
||||
) -> #(Int, String) {
|
||||
io.println(query)
|
||||
|
||||
case map.get(stations, query) {
|
||||
Ok(ds100) -> #(200, ds100)
|
||||
_ -> {
|
||||
let results = fuzzy(query)
|
||||
|> list.filter_map(fn (res) { map.get(ds100s, string.uppercase(res)) })
|
||||
case results {
|
||||
// results -> {
|
||||
// let names = results
|
||||
// |> list.map (fn (res) {
|
||||
// map.get(ds100s, string.uppercase(res))
|
||||
// |> result.map(fn(a) { "/" <> a })
|
||||
// |> result.unwrap("/")})
|
||||
// #(200, string.join(names, "\n"))
|
||||
// }
|
||||
[res] -> #(302, res)
|
||||
[res, ..] -> #(302, res)
|
||||
_ -> #(404, "??")
|
||||
}
|
||||
}
|
||||
case fuzzy(query, kind) {
|
||||
Exact(res) -> #(200, res)
|
||||
Fuzzy(res) -> #(302, res)
|
||||
Failed -> #(404, "??")
|
||||
}
|
||||
}
|
||||
|
||||
/// Keeps `res` when its status code is 200; otherwise runs `fallback` and
/// uses that result instead.
///
/// `fallback` is a thunk so that it is only evaluated when `res` is not a
/// 200. Whichever result is chosen is passed through `inspect` (Elixir's
/// `IO.inspect`) before being returned.
fn if_not(res: #(Int, t), fallback: fn() -> #(Int, t)) -> #(Int, t) {
  let #(status, _) = res
  let chosen = case status {
    200 -> res
    _ -> fallback()
  }
  inspect(chosen)
}
|
||||
|
||||
fn lookup_station(
|
||||
request: Request(t),
|
||||
stations: Map(String, String),
|
||||
ds100s: Map(String, String),
|
||||
leitpunkte: Map(String, String),
|
||||
baseurl: String,
|
||||
fuzzy: fn (String) -> List(String)
|
||||
ds100_to_name: Map(String, String),
|
||||
leitpunkt_to_name: Map(String, String),
|
||||
fuzzy: fn (String, IdKind) -> Matched(String)
|
||||
) -> Response(BitBuilder) {
|
||||
let #(code, text) = case request {
|
||||
// blackhole favicon.ico requests instead of using the index
|
||||
|
@ -140,35 +142,33 @@ fn lookup_station(
|
|||
Request(method: Get, path: "/help", ..)
|
||||
| Request(method: Get, path: "/", ..) -> #(
|
||||
200,
|
||||
"ds100 → Name: " <> baseurl <> "/NN\n" <> "Name → ds100: " <> baseurl <> "/Nürnberg Hbf",
|
||||
"ril100 → Name: " <> proto<>ril100_domain<>"/HG\n" <>
|
||||
"Name → ril100: " <> proto<>ril100_domain <> "/Göttingen\n\n" <>
|
||||
"Leitpunkt → Name: " <> proto<>leitpunkt_domain<>"/GOE\n" <>
|
||||
"Name → Leitpunkt: " <> proto<>leitpunkt_domain <> "/Göttingen\n\n"<>
|
||||
"Fuzzy:" <> proto<>domain<>"/...",
|
||||
)
|
||||
Request(method: Get, path: "/ds100/" <> path, ..) ->
|
||||
path
|
||||
|> unpercent
|
||||
|> string.uppercase
|
||||
|> lookup_exact(ds100s)
|
||||
Request(method: Get, path: "/name/" <> path, ..) ->
|
||||
path
|
||||
|> unpercent
|
||||
|> lookup_by_name(stations, ds100s, fuzzy)
|
||||
Request(method: Get, path: "/leitpunkt/" <> path, ..) ->
|
||||
path
|
||||
|> unpercent
|
||||
|> string.uppercase
|
||||
|> lookup_exact(leitpunkte)
|
||||
Request(method: Get, path: "/" <> path, ..) -> {
|
||||
let path = unpercent(path)
|
||||
|
||||
let by_ds100 = lookup_exact(path, ds100s)
|
||||
let by_lp = lookup_exact(path, leitpunkte)
|
||||
|
||||
case #(by_ds100.0, by_lp.0) {
|
||||
#(200, _) -> by_ds100
|
||||
#(_, 200) -> by_lp
|
||||
_ -> lookup_by_name(path, stations, ds100s, fuzzy)
|
||||
let query = unpercent(path)
|
||||
case get_header(request, "x-forwarded-host") {
|
||||
Ok(domain) if domain == leitpunkt_domain -> query
|
||||
|> lookup_exact(leitpunkt_to_name)
|
||||
|> if_not(fn() {lookup_fuzzy(query,Leitpunkt,fuzzy)})
|
||||
Ok(domain) if domain == ril100_domain || domain == ds100_domain -> query
|
||||
|> lookup_exact(ds100_to_name)
|
||||
|> if_not(fn() {lookup_fuzzy(query,DS100, fuzzy)})
|
||||
_ -> {
|
||||
let by_ds100 = lookup_exact(query, ds100_to_name)
|
||||
let by_lp = lookup_exact(query, leitpunkt_to_name)
|
||||
case #(by_ds100.0, by_lp.0) {
|
||||
#(200, _) -> #(302, proto<>ril100_domain<>"/"<>path)
|
||||
#(_, 200) -> #(302, proto<>leitpunkt_domain<>"/"<>path)
|
||||
_ -> #(302, proto<>ril100_domain<>"/"<>path)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ -> #(404, "intended usage is e.g. curl " <> baseurl <> "/FF")
|
||||
_ -> #(404, "intended usage is e.g. curl " <> proto<>domain<>"/FF")
|
||||
}
|
||||
let body = bit_builder.from_string(text)
|
||||
|
||||
|
@ -189,67 +189,54 @@ fn lookup_station(
|
|||
|> response.set_body(body)
|
||||
}
|
||||
|
||||
fn fetch_data() -> Result(String, hackney.Error) {
|
||||
let assert Ok(uri) =
|
||||
uri.parse(
|
||||
"https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
|
||||
)
|
||||
let assert Ok(request) = request.from_uri(uri)
|
||||
let assert Ok(response) = hackney.send(request)
|
||||
|
||||
// some ü are corrupted for some reason
|
||||
Ok(string.replace(response.body, "<EFBFBD>", "ü"))
|
||||
}
|
||||
|
||||
fn read_csv(contents) -> List(List(String)) {
|
||||
contents
|
||||
// the file doesn't use quotes, so this is fine
|
||||
|> string.split(on: "\n")
|
||||
// drop CSV header
|
||||
|> list.drop(1)
|
||||
|> list.map(fn(a) { string.split(a, on: ";") })
|
||||
}
|
||||
|
||||
pub fn main() {
|
||||
let assert Ok(bahn_ril100) = fetch_data()
|
||||
let baseurl = "https://bahnhof.name"
|
||||
let stations = read_csv(bahn_ril100)
|
||||
|
||||
let ds100s = read_csv(bahn_ril100)
|
||||
|> list.filter_map(fn(fields) {
|
||||
case fields {
|
||||
[_, ds100, name, ..] -> Ok(#(name, ds100))
|
||||
_ -> Error(fields)
|
||||
}
|
||||
})
|
||||
let stationmap =
|
||||
stations
|
||||
|> map.from_list
|
||||
let ds100map =
|
||||
stations
|
||||
|> list.map(fn(a) { #(a.1, a.0) })
|
||||
|> map.from_list
|
||||
let ref = atom.create_from_string("ref")
|
||||
let index = index_new(atom.create_from_string("stations"))
|
||||
|> index_ref(field_term("id"))
|
||||
|> index_field(field_new("name"))
|
||||
|> index_add(stations
|
||||
|> list.map(fn(tuple) {case tuple {
|
||||
#(name, ds100)
|
||||
-> map.from_list([#("id", ds100), #("name", name)]
|
||||
)}}))
|
||||
|
||||
let assert Ok(leitpunkte_raw) = file.read("data/leitpunkte.csv")
|
||||
let leitpunkte =
|
||||
read_csv(leitpunkte_raw)
|
||||
|> list.filter_map(fn(fields) {
|
||||
case fields {
|
||||
[lp, _, ds100] -> Ok(#(lp, ds100))
|
||||
[lp, name, _ds100] -> Ok(#(name, lp))
|
||||
_ -> Error(fields)
|
||||
}
|
||||
})
|
||||
|> map.from_list
|
||||
|
||||
let fuzzy = fn(searchterm: String) -> List(String) {
|
||||
let name_to_ds100 = map.from_list(ds100s)
|
||||
let name_to_leitpunkt = map.from_list(leitpunkte)
|
||||
let ds100_to_name = map.from_list(list.map(ds100s, swap))
|
||||
let leitpunkt_to_name = map.from_list(list.map(leitpunkte, swap))
|
||||
let ds100index = index_new(atom.create_from_string("ds100"))
|
||||
|> index_ref(field_term("id"))
|
||||
|> index_field(field_new("name"))
|
||||
|> index_add(ds100s
|
||||
|> list.map(fn(tuple) {case tuple {
|
||||
#(name, ds100)
|
||||
-> map.from_list([#("id", ds100), #("name", name)]
|
||||
)}}))
|
||||
let leitpunkt_index = index_new(atom.create_from_string("leitpunkt"))
|
||||
|> index_ref(field_term("id"))
|
||||
|> index_field(field_new("name"))
|
||||
|> index_add(leitpunkte
|
||||
|> list.map(fn(tuple) {case tuple {
|
||||
#(name, leitpunkt)
|
||||
-> map.from_list([#("id", leitpunkt), #("name", name)]
|
||||
)}}))
|
||||
|
||||
let ref = atom.create_from_string("ref")
|
||||
let fuzzy = fn(searchterm: String, kind: IdKind) -> List(String) {
|
||||
let query = query_new()
|
||||
let index = case kind {
|
||||
DS100 -> ds100index
|
||||
Leitpunkt -> leitpunkt_index
|
||||
}
|
||||
let match = atom.create_from_string("match")
|
||||
let field = atom.create_from_string("field")
|
||||
let term = atom.create_from_string("term")
|
||||
|
@ -282,12 +269,59 @@ pub fn main() {
|
|||
}
|
||||
}
|
||||
|
||||
io.println("compiled index, starting server …")
|
||||
let exact_then_fuzzy = fn(searchterm: String, kind: IdKind) -> Matched(String) {
|
||||
let #(stations, ids) = case kind {
|
||||
DS100 -> #(name_to_ds100, ds100_to_name)
|
||||
Leitpunkt -> #(name_to_leitpunkt, leitpunkt_to_name)
|
||||
}
|
||||
case map.get(stations, searchterm) {
|
||||
Ok(id) -> Exact(id)
|
||||
_ -> {
|
||||
let results = fuzzy(searchterm, kind)
|
||||
|> list.filter_map(fn (res) {
|
||||
map.get(ids, string.uppercase(res))
|
||||
})
|
||||
case results {
|
||||
[res] -> Fuzzy(res)
|
||||
[res, ..] -> Fuzzy(res)
|
||||
_ -> Failed
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
io.println("compiled indices, starting server …")
|
||||
|
||||
let _ = mist.run_service(
|
||||
2345,
|
||||
fn(req) { lookup_station(req, stationmap, ds100map, leitpunkte, baseurl, fuzzy) },
|
||||
fn(req) { lookup_station(
|
||||
req,
|
||||
ds100_to_name,
|
||||
leitpunkt_to_name,
|
||||
exact_then_fuzzy
|
||||
) },
|
||||
max_body_limit: 100,
|
||||
)
|
||||
process.sleep_forever()
|
||||
}
|
||||
|
||||
/// Downloads the DB Netz "Betriebsstellenverzeichnis" CSV and returns its
/// body as a string.
///
/// Returns `Error` with the hackney error when sending the request fails, so
/// the caller decides how to react instead of this function crashing.
fn fetch_data() -> Result(String, hackney.Error) {
  // The URL is a literal, so failing to parse it or to build a request from
  // it would be a programming error; asserting is fine for these two steps.
  let assert Ok(uri) =
    uri.parse(
      "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
    )
  let assert Ok(request) = request.from_uri(uri)

  case hackney.send(request) {
    // Some "ü" characters arrive corrupted in the download; patch them up.
    // NOTE(review): the search string shows up as `<EFBFBD>` in this view —
    // presumably the UTF-8 bytes of U+FFFD; confirm against the real file.
    Ok(response) -> Ok(string.replace(response.body, "<EFBFBD>", "ü"))
    Error(reason) -> Error(reason)
  }
}
|
||||
|
||||
/// Splits semicolon-separated text into rows of fields.
///
/// The first line is treated as the CSV header and discarded. Fields are
/// split on every `;` with no quote handling, which is sufficient because
/// the input file doesn't use quotes.
fn read_csv(contents) -> List(List(String)) {
  let lines = string.split(contents, on: "\n")
  // everything after the header line is data
  let rows = list.drop(lines, 1)
  list.map(rows, fn(row) { string.split(row, on: ";") })
}
|
||||
|
|
Loading…
Reference in a new issue