separate domains, fuzzy matching all-round
This commit is contained in:
parent 128198f070
commit 6a61154bda
1 changed file with 138 additions and 104 deletions
@@ -1,5 +1,5 @@
 import gleam/http/response.{Response}
-import gleam/http/request.{Request}
+import gleam/http/request.{Request,get_header}
 import gleam/http.{Get}
 import gleam/bit_builder.{BitBuilder}
 import gleam/erlang/process
@@ -13,8 +13,15 @@ import gleam/list
 import gleam/map.{Map}
 import gleam/uri
 import gleam/hackney
+import gleam/pair.{swap}
 import mist
 
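+// One subdomain per identifier scheme; lookup_station dispatches on the x-forwarded-host header below.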
+const ds100_domain = "ds100.bahnhof.name"
+const ril100_domain = "ril100.bahnhof.name"
+const leitpunkt_domain = "leitpunkt.bahnhof.name"
+const domain = "bahnhof.name"
+const proto = "https://"
+
 external type Index
 external type Field
 
@@ -36,9 +43,6 @@ external fn field_new(String) -> Field =
 external fn index_add(Index, List(a)) -> Index =
   "Elixir.Haystack.Index" "add"
 
-external fn index_search(Index, String) -> List(Map(atom.Atom, String)) =
-  "Elixir.Haystack.Index" "search"
-
 pub external fn inspect(a) -> a =
   "Elixir.IO" "inspect"
 
@@ -61,6 +65,17 @@ external fn query_expression_new(atom.Atom, List(#(atom.Atom, String))) -> Expre
 external fn tokenize(String) -> List(Map(atom.Atom, String)) =
   "Elixir.Haystack.Tokenizer" "tokenize"
 
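+// Which identifier scheme a lookup targets: DS100/RIL100 station codes or Leitpunkte.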
+type IdKind {
+  DS100
+  Leitpunkt
+}
+
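+// Outcome of a lookup: an exact hit, a best fuzzy match, or nothing usable.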
+type Matched(t) {
+  Exact(t)
+  Fuzzy(t)
+  Failed
+}
+
 fn unpercent(encoded: String) -> String {
   let #([head], chunks) =
     encoded
@@ -96,43 +111,30 @@ fn lookup_exact(query: String, lookup: Map(String, String)) -> #(Int, String) {
 }
 
 /// Looks up a station by its name, with fuzzy matching.
-fn lookup_by_name(
+fn lookup_fuzzy(
   query: String,
-  stations: Map(String, String),
-  ds100s: Map(String, String),
-  fuzzy: fn(String) -> List(String)
+  kind: IdKind,
+  fuzzy: fn(String, IdKind) -> Matched(String)
 ) -> #(Int, String) {
-  io.println(query)
-  case map.get(stations, query) {
-    Ok(ds100) -> #(200, ds100)
-    _ -> {
-      let results = fuzzy(query)
-      |> list.filter_map(fn (res) { map.get(ds100s, string.uppercase(res)) })
-      case results {
-        // results -> {
-        //   let names = results
-        //   |> list.map (fn (res) {
-        //     map.get(ds100s, string.uppercase(res))
-        //     |> result.map(fn(a) { "/" <> a })
-        //     |> result.unwrap("/")})
-        //   #(200, string.join(names, "\n"))
-        // }
-        [res] -> #(302, res)
-        [res, ..] -> #(302, res)
-        _ -> #(404, "??")
-      }
-    }
+  case fuzzy(query, kind) {
+    Exact(res) -> #(200, res)
+    Fuzzy(res) -> #(302, res)
+    Failed -> #(404, "??")
   }
 }
 
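+// Keeps res when it already carries a 200; otherwise evaluates the fallback. inspect() echoes the chosen result.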
+fn if_not(res: #(Int,t), fallback: fn() -> #(Int,t)) -> #(Int,t) {
+  inspect(case res {
+    #(200, _) -> res
+    _ -> fallback()
+  })
+}
 
 fn lookup_station(
   request: Request(t),
-  stations: Map(String, String),
-  ds100s: Map(String, String),
-  leitpunkte: Map(String, String),
-  baseurl: String,
-  fuzzy: fn (String) -> List(String)
+  ds100_to_name: Map(String, String),
+  leitpunkt_to_name: Map(String, String),
+  fuzzy: fn (String, IdKind) -> Matched(String)
 ) -> Response(BitBuilder) {
   let #(code, text) = case request {
     // blackhole favicon.ico requests instead of using the index
@@ -140,35 +142,33 @@ fn lookup_station(
     Request(method: Get, path: "/help", ..)
     | Request(method: Get, path: "/", ..) -> #(
       200,
-      "ds100 → Name: " <> baseurl <> "/NN\n" <> "Name → ds100: " <> baseurl <> "/Nürnberg Hbf",
+      "ril100 → Name: " <> proto<>ril100_domain<>"/HG\n" <>
+      "Name → ril100: " <> proto<>ril100_domain <> "/Göttingen\n\n" <>
+      "Leitpunkt → Name: " <> proto<>leitpunkt_domain<>"/GOE\n" <>
+      "Name → Leitpunkt: " <> proto<>leitpunkt_domain <> "/Göttingen\n\n"<>
+      "Fuzzy:" <> proto<>domain<>"/...",
     )
-    Request(method: Get, path: "/ds100/" <> path, ..) ->
-      path
-      |> unpercent
-      |> string.uppercase
-      |> lookup_exact(ds100s)
-    Request(method: Get, path: "/name/" <> path, ..) ->
-      path
-      |> unpercent
-      |> lookup_by_name(stations, ds100s, fuzzy)
-    Request(method: Get, path: "/leitpunkt/" <> path, ..) ->
-      path
-      |> unpercent
-      |> string.uppercase
-      |> lookup_exact(leitpunkte)
     Request(method: Get, path: "/" <> path, ..) -> {
-      let path = unpercent(path)
-      let by_ds100 = lookup_exact(path, ds100s)
-      let by_lp = lookup_exact(path, leitpunkte)
-      case #(by_ds100.0, by_lp.0) {
-        #(200, _) -> by_ds100
-        #(_, 200) -> by_lp
-        _ -> lookup_by_name(path, stations, ds100s, fuzzy)
+      let query = unpercent(path)
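+      // Dispatch on the reverse proxy's x-forwarded-host header: each subdomain serves one identifier scheme.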
+      case get_header(request, "x-forwarded-host") {
+        Ok(domain) if domain == leitpunkt_domain -> query
+          |> lookup_exact(leitpunkt_to_name)
+          |> if_not(fn() {lookup_fuzzy(query,Leitpunkt,fuzzy)})
+        Ok(domain) if domain == ril100_domain || domain == ds100_domain -> query
+          |> lookup_exact(ds100_to_name)
+          |> if_not(fn() {lookup_fuzzy(query,DS100, fuzzy)})
+        _ -> {
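+          // No recognised host: redirect (302) to the domain whose table matches exactly, defaulting to ril100.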
+          let by_ds100 = lookup_exact(query, ds100_to_name)
+          let by_lp = lookup_exact(query, leitpunkt_to_name)
+          case #(by_ds100.0, by_lp.0) {
+            #(200, _) -> #(302, proto<>ril100_domain<>"/"<>path)
+            #(_, 200) -> #(302, proto<>leitpunkt_domain<>"/"<>path)
+            _ -> #(302, proto<>ril100_domain<>"/"<>path)
+          }
+        }
       }
     }
-    _ -> #(404, "intended usage is e.g. curl " <> baseurl <> "/FF")
+    _ -> #(404, "intended usage is e.g. curl " <> proto<>domain<>"/FF")
   }
   let body = bit_builder.from_string(text)
 
@@ -189,67 +189,54 @@ fn lookup_station(
   |> response.set_body(body)
 }
 
-fn fetch_data() -> Result(String, hackney.Error) {
-  let assert Ok(uri) =
-    uri.parse(
-      "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
-    )
-  let assert Ok(request) = request.from_uri(uri)
-  let assert Ok(response) = hackney.send(request)
-
-  // some ü are corrupted for some reason
-  Ok(string.replace(response.body, "�", "ü"))
-}
-
-fn read_csv(contents) -> List(List(String)) {
-  contents
-  // the file doesn't use quotes, so this is fine
-  |> string.split(on: "\n")
-  // drop CSV header
-  |> list.drop(1)
-  |> list.map(fn(a) { string.split(a, on: ";") })
-}
-
 pub fn main() {
   let assert Ok(bahn_ril100) = fetch_data()
-  let baseurl = "https://bahnhof.name"
-  let stations = read_csv(bahn_ril100)
+  let ds100s = read_csv(bahn_ril100)
   |> list.filter_map(fn(fields) {
     case fields {
       [_, ds100, name, ..] -> Ok(#(name, ds100))
       _ -> Error(fields)
     }
   })
-  let stationmap =
-    stations
-    |> map.from_list
-  let ds100map =
-    stations
-    |> list.map(fn(a) { #(a.1, a.0) })
-    |> map.from_list
-  let ref = atom.create_from_string("ref")
-  let index = index_new(atom.create_from_string("stations"))
-    |> index_ref(field_term("id"))
-    |> index_field(field_new("name"))
-    |> index_add(stations
-      |> list.map(fn(tuple) {case tuple {
-        #(name, ds100)
-        -> map.from_list([#("id", ds100), #("name", name)]
-      )}}))
 
   let assert Ok(leitpunkte_raw) = file.read("data/leitpunkte.csv")
   let leitpunkte =
     read_csv(leitpunkte_raw)
     |> list.filter_map(fn(fields) {
      case fields {
-        [lp, _, ds100] -> Ok(#(lp, ds100))
+        [lp, name, _ds100] -> Ok(#(name, lp))
        _ -> Error(fields)
      }
    })
-    |> map.from_list
 
-  let fuzzy = fn(searchterm: String) -> List(String) {
+  let name_to_ds100 = map.from_list(ds100s)
+  let name_to_leitpunkt = map.from_list(leitpunkte)
+  let ds100_to_name = map.from_list(list.map(ds100s, swap))
+  let leitpunkt_to_name = map.from_list(list.map(leitpunkte, swap))
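+  // One Haystack index per identifier scheme, so fuzzy results stay within the scheme that was queried.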
+  let ds100index = index_new(atom.create_from_string("ds100"))
+    |> index_ref(field_term("id"))
+    |> index_field(field_new("name"))
+    |> index_add(ds100s
+      |> list.map(fn(tuple) {case tuple {
+        #(name, ds100)
+        -> map.from_list([#("id", ds100), #("name", name)]
+      )}}))
+  let leitpunkt_index = index_new(atom.create_from_string("leitpunkt"))
+    |> index_ref(field_term("id"))
+    |> index_field(field_new("name"))
+    |> index_add(leitpunkte
+      |> list.map(fn(tuple) {case tuple {
+        #(name, leitpunkt)
+        -> map.from_list([#("id", leitpunkt), #("name", name)]
+      )}}))
+
+  let ref = atom.create_from_string("ref")
+  let fuzzy = fn(searchterm: String, kind: IdKind) -> List(String) {
     let query = query_new()
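+    // Pick the index that matches the requested identifier scheme.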
+    let index = case kind {
+      DS100 -> ds100index
+      Leitpunkt -> leitpunkt_index
+    }
     let match = atom.create_from_string("match")
     let field = atom.create_from_string("field")
     let term = atom.create_from_string("term")
@@ -282,12 +269,59 @@ pub fn main() {
     }
   }
 
-  io.println("compiled index, starting server …")
+  let exact_then_fuzzy = fn(searchterm: String, kind: IdKind) -> Matched(String) {
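+    // Try the exact name table first; fall back to the fuzzy index only on a miss.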
+    let #(stations, ids) = case kind {
+      DS100 -> #(name_to_ds100, ds100_to_name)
+      Leitpunkt -> #(name_to_leitpunkt, leitpunkt_to_name)
+    }
+    case map.get(stations, searchterm) {
+      Ok(id) -> Exact(id)
+      _ -> {
+        let results = fuzzy(searchterm, kind)
+        |> list.filter_map(fn (res) {
+          map.get(ids, string.uppercase(res))
+        })
+        case results {
+          [res] -> Fuzzy(res)
+          [res, ..] -> Fuzzy(res)
+          _ -> Failed
+        }
+      }
+    }
+  }
+
+  io.println("compiled indices, starting server …")
 
   let _ = mist.run_service(
     2345,
-    fn(req) { lookup_station(req, stationmap, ds100map, leitpunkte, baseurl, fuzzy) },
+    fn(req) { lookup_station(
+      req,
+      ds100_to_name,
+      leitpunkt_to_name,
+      exact_then_fuzzy
+    ) },
     max_body_limit: 100,
   )
   process.sleep_forever()
 }
+
+fn fetch_data() -> Result(String, hackney.Error) {
+  let assert Ok(uri) =
+    uri.parse(
+      "https://download-data.deutschebahn.com/static/datasets/betriebsstellen/DBNetz-Betriebsstellenverzeichnis-Stand2021-10.csv",
+    )
+  let assert Ok(request) = request.from_uri(uri)
+  let assert Ok(response) = hackney.send(request)
+
+  // some ü are corrupted for some reason
+  Ok(string.replace(response.body, "�", "ü"))
+}
+
+fn read_csv(contents) -> List(List(String)) {
+  contents
+  // the file doesn't use quotes, so this is fine
+  |> string.split(on: "\n")
+  // drop CSV header
+  |> list.drop(1)
+  |> list.map(fn(a) { string.split(a, on: ";") })
+}