Standalone (code) • tilt.company.match

If you can’t install the tilt.company.match package do this:

Ensure to install the required packages:

install.packages("dplyr")
install.packages("glue")
install.packages("magrittr")
install.packages("purrr")
install.packages("rlang")
install.packages("stringdist")
install.packages("stringi")
install.packages("tibble")
install.packages("vctrs")
install.packages("vroom")

Copy-paste the code below into a “standalone.R” file.
Replace library(tilt.company.match) with source("sandalone.R").
Replace each example_file("demo_*.csv") with the corresponding file here.

standalone.R

#' Aborts when data has missing values on non-nullable columns
#'
#' @param data Tibble holding a result data set.
#' @param non_nullable_cols A character vector holding names of columns on which
#'   `NA`s are not allowed.
#'
#' @return Input `data` invisibly.
#' @export
#'
#' @examples
#' data <- tibble::tibble(x = NA, y = 1, z = NA)
#'
#' # With NA in nullable columns returns data invisibly
#' data %>% abort_if_incomplete(non_nullable_cols = "y")
#' out <- data %>% abort_if_incomplete(non_nullable_cols = "y")
#' identical(out, data)
#'
#' # With NA in one nullable column, alerts the column to review as an error
#' data %>%
#'   abort_if_incomplete(non_nullable_cols = c("x", "y")) %>%
#'   try()
#'
#' # By default, it takes all columns as non-nullable
#' data %>%
#'   abort_if_incomplete() %>%
#'   try()
#' @keywords internal
abort_if_incomplete <- function(data, non_nullable_cols = names(data)) {
  incomplete <- select_incomplete(data[non_nullable_cols])
  if (any(incomplete)) {
    cols <- toString(names(incomplete[incomplete]))
    rlang::abort(c(
      "Non-nullable columns must not have `NA`s.",
      x = paste0("Columns to review: ", cols)
    ))
  }
  invisible(data)
}

select_incomplete <- function(data) {
  missing <- purrr::keep(data, function(x) any(is.na(x)))
  unlist(lapply(missing, anyNA))
}
#' Check if a named object contains expected names
#'
#' Based on fgeo.tool::check_crucial_names()
#'
#' @param x A named object.
#' @param expected_names String; expected names of `x`.
#'
#' @return Invisible `x`, or an error with informative message.
#'
#' Adapted from: https://github.com/RMI-PACTA/r2dii.match/blob/main/R/check_crucial_names.R
#'
#' @examples
#' x <- c(a = 1)
#' check_crucial_names(x, "a")
#' try(check_crucial_names(x, "bad"))
#' @export
#' @keywords internal
check_crucial_names <- function(x, expected_names) {
  stopifnot(rlang::is_named(x))
  stopifnot(is.character(expected_names))

  ok <- all(unique(expected_names) %in% names(x))
  if (!ok) {
    abort_missing_names(sort(setdiff(expected_names, names(x))))
  }

  invisible(x)
}

abort_missing_names <- function(missing_names) {
  rlang::abort(
    "missing_names",
    message = glue::glue(
      "Must have missing names:
      {paste0('`', missing_names, '`', collapse = ', ')}"
    )
  )
}
#' Demo loanbook entries
#'
#' A simplified demo tilt company data set to illustrate and test matching with
#' loanbook. For details on included cases please refer to data generation
#' script.
#'
#' @format A tibble with 11 rows and 5 variables:
#' \describe{
#' \item{id}{a numeric id}
#' \item{company_name}{name of company}
#' \item{postcode}{postcode of company}
#' \item{country}{country name in lowercase}
#' \item{misc_info}{A placeholder column that holds additional information that human matchers would consider in matching}
#' }
#' @examples
#' demo_loanbook
"demo_loanbook"
#' Demo matched db entries
#'
#' A simplified demo matched company data set to illustrate the manual matching process with
#' loanbook. For details on included cases please refer to data generation
#' script.
#'
#' @format A tibble with 9 rows and 5 variables:
#' \describe{
#' \item{id}{a numeric id}
#' \item{company_name}{name of company}
#' \item{postcode}{postcode of company}
#' \item{country}{country name in lowercase}
#' \item{misc_info}{A placeholder column that holds additional information that human matchers would consider in matching}
#' \item{company_alias}{name of the company, preprocessed}
#' \item{id_tilt}{a numeric id in the tilt db}
#' \item{company_name_tilt}{name of company}
#' \item{misc_info_tilt}{A placeholder column that holds additional information that human matchers would consider in matching}
#' \item{company_alias_tilt}{name of the company, preprocessed}
#' \item{string_sim}{string similarity between aliased company name in the loanbook and aliase company name in tilt db}
#' \item{suggest_match}{set to TRUE if string_sim is above a certain threshold}
#' \item{accept_match}{manual decision to whether the company from the loanbook matches a comapny in the tilt db}
#' }
#' @examples
#' demo_matched
"demo_matched"
#' Demo tilt db entries
#'
#' A simplified demo tilt company data set to illustrate and test matching with
#' loanbook. For details on included cases please refer to data generation
#' script.
#'
#' @format A tibble with 9 rows and 5 variables:
#' \describe{
#' \item{id}{a numeric id}
#' \item{company_name}{name of company}
#' \item{postcode}{postcode of company}
#' \item{country}{country name in lowercase}
#' \item{misc_info}{A placeholder column that holds additional information that human matchers would consider in matching}
#' }
#' @examples
#' demo_tilt
"demo_tilt"
#' @name demo_loanbook
#' @keywords internal
NULL
#' @name demo_tilt
#' @keywords internal
NULL
#' @name demo_matched
#' @keywords internal
NULL
#' Detect duplicated strings
#'
#' @inheritDotParams base::paste
#' @return  A logical vector of the same length as the longest vector passed to
#'   `...`.
#' @export
#' @examples
#' duplicated_paste(c("a", "a"), 1:2)
#' paste(c("a", "a"), 1:2)
#'
#' duplicated_paste(c("a", "a"), c(1, 1))
#' paste(c("a", "a"), c(1, 1))
#' @keywords internal
duplicated_paste <- function(...) {
  duplicated(paste(...))
}
#' Get the path to an example file
#'
#' @param file Name of the file.
#'
#' @return A path.
#' @export
#'
#' @examples
#' example_file("demo_loanbook.csv")
#'
#' example_file("demo_tilt.csv")
#'
#' example_file("demo_matched.csv")
example_file <- function(file) {
  system.file("extdata", file, package = "tilt.company.match", mustWork = TRUE)
}
#' Help read an example file
#' @examples
#' read_example("demo_loanbook.csv")
#' @noRd
read_example <- function(file) {
  vroom::vroom(example_file(file), show_col_types = FALSE)
}
#' Reports companies that were not matched in the loanbook
#'
#' @param loanbook Loanbook data set
#'
#' @param manually_matched Tibble holding the result of the matching process,
#'   after the user has manually selected and matched the companies in the
#'   loanbook with the tilt data set.
#'
#' @return `not_matched_companies` Tibble holding id and company name of the
#'   companies not matched by the tilt data set.
#'
#' @export
#' @examples
#' library(tibble)
#'
#' loanbook <- tibble(id = 1:2, company_name = letters[id], irrelevant = "xyz")
#' loanbook
#'
#' accepted <- tibble(id = 1:2, accept_match = c(TRUE, FALSE))
#' accepted
#'
#' report_no_matches(loanbook, accepted)
#'
#' # It's rigurous but fails with informative messages:
#' # The names of crucial columns must be as documented.
#' try(report_no_matches(loanbook, tibble(ids = 1, accept_match = TRUE)))
#'
#' # The type of `accept_match` must be as documented.
#' try(report_no_matches(loanbook, tibble(id = 1, accept_match = "TRUE")))
report_no_matches <- function(loanbook, manually_matched) {
  check_crucial_names(loanbook, c("id", "company_name"))
  check_crucial_names(manually_matched, c("id", "accept_match"))
  vctrs::vec_assert(manually_matched$accept_match, logical())

  # Filter first by all the manual successful matches in order to
  # suppress the duplicates caused by the string matching.
  matched <- dplyr::filter(manually_matched, .data$accept_match)

  # TODO: Simplify with `anti_join()`
  dplyr::left_join(loanbook, matched) %>%
    suppressMessages() |>
    dplyr::mutate(
      matched = dplyr::case_when(
        accept_match == TRUE ~ "Matched",
        is.na(accept_match) ~ "Not Matched",
        TRUE ~ "Not Matched"
      )
    ) |>
    dplyr::filter(matched == "Not Matched") %>%
    dplyr::distinct(.data$company_name, .data$id)
}
#' Suggest matching companies in a `loanbook` and `tilt` datasets
#'
#' This function suggests that a company in your `loanbook` is the same as a
#' company in the `tilt` dataset when the `similarity` between their names meets
#' all of these conditions:
#' * It's the highest among all other candidates.
#' * It's above the value set in the argument `suggestion_threshold`.
#' * It's the only such highest value in the group defined by a combination of
#' `company_name` x `postcode` -- to avoid duplicates.
#'
#' This function calculates the similarity between a standardized alias of the
#' `company_name` from the `loanbook` and `tilt` datasets. The standardized
#' alias makes real matches more likely by applying common best practices in
#' names matching. Complete similarity corresponds to `1`, and complete
#' dissimilarity corresponds to `0`.
#'
#' The columns `postcode` and `country` affect the quality of the matches and
#' the amount of manual-validation work ahead:
#' * If your `loanbook` has both `postcode` and `country` we match companies in
#' that specific `postcode` and that specific `country`. You will likely match
#' companies that are really the same (true positives) because it's unlikely
#' that two companies with similar name will be located close to each other.
#' This will cost you the minimum amount of manual-validation work ahead.
#' * If your `loanbook` lacks `postcode` but has `country` we match companies in
#' that specific `country` but across every `postcode`. You will possibly match
#' companies that are not really the same (false positives) but happen to have a
#' similar name and are located in the same `country`. This will cost you
#' additional manual-validation work ahead.
#' * If your `loanbook` has `postcode` but lacks `country` we match companies with
#' the same `postcode` but  across every `country`. You will possibly match
#' companies that are not really the same (false positives) but happen to have a
#' similar name and the same
#' postcode. This will cost you additional manual-validation work ahead.
#' * If your `loanbook` lacks both `postcode` and `country` we match companies
#' across the entire dataset.  You will most likely match companies that are not
#' really the same (false positives). This will cost you the greatest amount of
#' additional manual-validation work ahead.
#'
#' @param loanbook A `loanbook` dataframe like [demo_loanbook].
#' @param tilt A `tilt` dataframe like [demo_tilt].
#' @param eligibility_threshold Minimum value of `similarity` to keep a
#'   candidate match. Values under it are most likely false positives and thus
#'   dropped. This drastically reduce the number of candidates you'll need to
#'   validate manually. We believe this benefit outweighs the potential loss of
#'   a few true positives.
#' @param suggestion_threshold Value of `similarity` above which a match may be
#'   suggested.
#'
#' @return A dataframe with:
#'   * All the columns from the `loanbook` dataset.
#'   * All the columns from the `tilt` dataset butthe columns
#'   `id`, `company_name`, `postcode` and `country` all get the suffix "_tilt".
#'   * New columns:
#'       * `company_alias`
#'       * `company_alias_tilt`
#'       * `similarity`
#'       * `suggest_match`
#'       * `accept_match`.
#' For each company in the `loanbook` matching candidates are arranged by
#' descending `similarity`.
#'
#' @export
#' @examples
#' library(vroom)
#' loanbook <- vroom(example_file("demo_loanbook.csv"), show_col_types = FALSE)
#' tilt <- vroom(example_file("demo_tilt.csv"), show_col_types = FALSE)
#'
#' suggest_match(loanbook, tilt)
suggest_match <- function(loanbook,
                          tilt,
                          eligibility_threshold = 0.75,
                          suggestion_threshold = 0.9) {
  loanbook_alias <- loanbook %>% mutate(company_alias = to_alias(.data$company_name))
  # TODO: We can pre-compute this before we send the tilt dataset
  tilt_alias <- tilt %>% mutate(company_alias = to_alias(.data$company_name))

  # TODO: Ignore grouping if reading and matching line by line
  lacks_none <- loanbook_alias %>%
    filter(!is.na(.data$postcode) & !is.na(.data$country)) %>%
    left_join(
      tilt_alias,
      by = c("country", "postcode"),
      suffix = c("", "_tilt"),
      multiple = "all"
    ) |>
    suppressMessages()

  lacks_postcode <- loanbook_alias %>%
    filter(is.na(.data$postcode) & !is.na(.data$country)) %>%
    left_join(
      tilt_alias,
      by = c("country"),
      suffix = c("", "_tilt"),
      multiple = "all"
    ) %>%
    suppressMessages()

  lacks_country <- loanbook_alias %>%
    filter(!is.na(.data$postcode) & is.na(.data$country)) %>%
    left_join(tilt_alias, by = c("postcode"), suffix = c("", "_tilt")) %>%
    suppressMessages()

  lacks_both <- loanbook_alias %>%
    filter(is.na(.data$postcode) & is.na(.data$country)) %>%
    mutate(postcode = "join_helper") %>%
    inner_join(
      dplyr::mutate(tilt_alias, postcode = "join_helper"),
      by = c("postcode"),
      suffix = c("", "_tilt"),
      multiple = "all"
    ) %>%
    suppressMessages() %>%
    mutate(postcode = NA_character_)

  candidates <- bind_rows(lacks_none, lacks_postcode, lacks_country, lacks_both)

  okay_candidates <- candidates %>%
    # Other parameters may perform best. See `?stringdist::stringsim`
    mutate(similarity = stringsim(
      .data$company_alias, .data$company_alias_tilt,
      # Good to compare human typed text that might have typos.
      method = "jw",
      p = 0.1
    )) %>%
    # Arrange matching candidates from more to less similar
    arrange(id, -.data$similarity)

  best_candidates <- okay_candidates %>%
    filter(.data$similarity > eligibility_threshold | is.na(.data$similarity))
  # FIXME: Dead code?
  unmatched <- anti_join(
    okay_candidates %>% distinct(id, .data$company_name),
    best_candidates %>% distinct(id, .data$company_name)
  ) %>%
    suppressMessages()

  candidates_suggest_match <- best_candidates %>%
    # - It's the highest among all other candidates.
    group_by(id) %>%
    filter(.data$similarity == max(.data$similarity)) %>%
    # - It's above the threshold.
    filter(.data$similarity > suggestion_threshold) %>%
    # - It's the only such highest value in the group defined by a combination of
    # `company_name` x `postcode` -- to avoid duplicates.
    mutate(duplicates = any(duplicated_paste(.data$company_name, .data$postcode))) %>%
    filter(!.data$duplicates) %>%
    select("id", "id_tilt") %>%
    mutate(suggest_match = TRUE) %>%
    ungroup()

  to_edit <- best_candidates %>%
    left_join(candidates_suggest_match, by = c("id", "id_tilt")) %>%
    suppressMessages() %>%
    mutate(accept_match = NA)

  to_edit
}

#' Checks your `loanbook` is as we expect
#'
#' @param loanbook A `loanbook` dataframe like [demo_loanbook].
#'
#' @return Called for it's side effects. Returns `loanbook` invisibly.
#' @export
#'
#' @examples
#' library(vroom)
#' library(dplyr, warn.conflicts = FALSE)
#'
#' loanbook <- vroom(example_file("demo_loanbook.csv"), show_col_types = FALSE)
#' check_loanbook(loanbook)
#'
#' # Do you have the expected columns?
#' bad_name <- rename(loanbook, ids = id)
#' try(check_loanbook(bad_name))
#'
#' # Do you have any duplicates in the column `id`?
#' bad_id <- bind_rows(loanbook, slice(loanbook, 1))
#' try(check_loanbook(bad_id))
#'
#' # Do you have missing values (`NA`s) in non-nullable columns?
#' # styler: off
#' missing_id <- tribble(
#'   ~id,            ~company_name, ~postcode,  ~country, ~misc_info,
#'    NA, "John Meier's Groceries",   "55555", "germany",        "Y",
#'    11, "John Meier's Groceries",   "55555",  "norway",        "Y"
#' )
#' # styler: on
#' try(check_loanbook(missing_id))
check_loanbook <- function(loanbook) {
  expected <- c("id", "company_name", "postcode", "country")
  loanbook %>% check_crucial_names(expected)

  has_no_duplicates <- identical(anyDuplicated(loanbook$id), 0L)
  stopifnot(has_no_duplicates)

  best_without_duplicates <- c("company_name", "postcode", "country")
  report_duplicates(loanbook, best_without_duplicates)

  non_nullable <- c("id", "company_name")
  loanbook %>% abort_if_incomplete(non_nullable)

  invisible(loanbook)
}
#' @keywords internal
"_PACKAGE"

# TODO: Import each function individually (hack to keep R CMD check happy)
#' @import dplyr

## usethis namespace: start
#' @importFrom stringdist stringsim
#' @importFrom tibble tibble
#' @importFrom vroom vroom
## usethis namespace: end
NULL
#' Assign an additional name to an entity
#'
#' * `to_alias()` takes any character vector and creates an alias by
#' transforming the input (a) to lower case; (b) to latin-ascii characters; and
#' (c) to standard abbreviations of ownership types. Commonly, the inputs are
#' values from the columns `name_direct_loantaker` or `name_ultimate_parent`
#' of a loanbook dataset, or from the column `name_company` of an asset-level
#' dataset.
#' * `from_name_to_alias()` outputs a table giving default strings used to
#' convert from a name to its alias. You may amend this table and pass it to
#' `to_alias()` via the `from_to` argument.
#'
#' @section Assigning aliases:
#' The transformation process used to compare names between loanbook and tilt
#' datasets applies best practices commonly used in name matching algorithms:
#' * Remove special characters.
#' * Replace language specific characters.
#' * Abbreviate certain names to reduce their importance in the matching.
#' * Spell out numbers to increase their importance.
#'
#' @author person(given = "Evgeny", family = "Petrovsky", role = c("aut",
#'   "ctr"))
#'
#' Adapted from: https://github.com/RMI-PACTA/r2dii.match/blob/main/R/to_alias.R
#'
#' @source [r2dii.match](https://cran.r-project.org/package=r2dii.match) version 0.1.3.
#'
#' @param x Character string, commonly from the columns `name_direct_loantaker`
#'   or `name_ultimate_parent` of a loanbook dataset, or from the column
#'   `name_company` of an asset-level dataset.
#' @param from_to A data frame with replacement rules to be applied, contains
#'   columns `from` (for initial values) and `to` (for resulting values).
#' @param ownership vector of company ownership types to be distinguished for
#'   cut-off or separation.
#' @param remove_ownership Flag that defines whether ownership type (like llc)
#'   should be cut-off.
#'
#' @return
#' * `to_alias()` returns a character string.
#' * `from_name_to_alias()` returns a [tibble::tibble] with columns `from` and
#' `to`.
#'
#' @examples
#' library(dplyr)
#'
#' to_alias("A. and B")
#' to_alias("Acuity Brands Inc")
#' to_alias(c("3M Company", "Abbott Laboratories", "AbbVie Inc."))
#'
#' custom_replacement <- tibble(from = "AAAA", to = "B")
#' to_alias("Aa Aaaa", from_to = custom_replacement)
#'
#' neutral_replacement <- tibble(from = character(0), to = character(0))
#' to_alias("Company Name Owner", from_to = neutral_replacement)
#' to_alias(
#'   "Company Name Owner",
#'   from_to = neutral_replacement,
#'   ownership = "owner",
#'   remove_ownership = TRUE
#' )
#'
#' from_name_to_alias()
#'
#' append_replacements <- from_name_to_alias() %>%
#'   add_row(
#'     .before = 1,
#'     from = c("AA", "BB"), to = c("alpha", "beta")
#'   )
#' append_replacements
#'
#' # And in combination with `to_alias()`
#' to_alias(c("AA", "BB", "1"), from_to = append_replacements)
#' @export
#' @keywords internal
to_alias <- function(x,
                     from_to = NULL,
                     ownership = NULL,
                     remove_ownership = FALSE) {
  out <- x
  # base latin characters
  out <- stringi::stri_trans_general(out, "any-latin")
  out <- stringi::stri_trans_general(out, "latin-ascii")
  # lowercase
  out <- tolower(out)

  # symbols
  out <- purrr::reduce(get_sym_replace(), replace_abbrev, fixed = TRUE, .init = out)

  # only one space between words
  out <- gsub("[[:space:]]+", " ", out)

  out <- replace_with_abbreviation(from_to, .init = out)

  # trim redundant whitespaces
  out <- trimws(out, which = "both")

  # ?
  out <- gsub("(?<=\\s[a-z]{1}) (?=[a-z]{1})", "", out, perl = TRUE)

  out <- may_remove_ownership(remove_ownership, ownership, .init = out)

  # final adjustments
  out <- gsub("-", " ", out)
  out <- gsub("[[:space:]]", "", out)
  out <- gsub("[^[:alnum:][:space:]$]", "", out)
  out <- gsub("$", " ", out, fixed = TRUE)

  out
}

may_remove_ownership <- function(remove_ownership, ownership, .init) {
  ownership <- ownership %||% get_ownership_type()

  # ownership type distinguished (with $ sign) in company name
  paste_or_not <- function(x, remove_ownership) {
    if (remove_ownership) {
      c(paste0(" ", x, "$"), "")
    } else {
      c(paste0(" ", x, "$"), paste0("$", x))
    }
  }

  out <- purrr::map(ownership, ~ paste_or_not(.x, remove_ownership))
  purrr::reduce(out, replace_abbrev, .init = .init)
}

# Technology mix for analysis
get_ownership_type <- function() {
  c(
    "ab",
    "ag",
    "as",
    "asa",
    "bhd",
    "bsc",
    "bv",
    "co",
    "corp",
    "cv",
    "dac",
    "gmbh",
    "govt",
    "hldgs",
    "inc",
    "intl",
    "jsc",
    "llc",
    "lp",
    "ltd",
    "nv",
    "pcl",
    "pjsc",
    "plc",
    "pt",
    "pte",
    "sa",
    "sarl",
    "sas",
    "se",
    "spa",
    "spzoo",
    "srl"
  )
}

# replace each lhs with rhs
get_sym_replace <- function() {
  list(
    c(".", " "),
    c(",", " "),
    c("_", " "),
    c("/", " "),
    c("$", "")
  )
}

#' From name to alias
#'
#' Function that outputs a table giving default strings used to
#' convert from a name to its alias. You may amend this table and pass it to
#' `to_alias()` via the `from_to` argument.
#'
#' Source: @jdhoffa https://github.com/RMI-PACTA/r2dii.dataraw/pull/8
#'
#' @return [tibble::tibble] with columns `from` and
#' `to`.
#' @export
#' @keywords internal
from_name_to_alias <- function() {
  # styler: off
  tibble::tribble(
    ~from,               ~to,
    " and ",             " & ",
    " en ",             " & ",
    " och ",             " & ",
    " und ",             " & ",
    "(pjsc)",                "",
    "(pte)",                "",
    "(pvt)",                "",
    "0",            "null",
    "1",             "one",
    "2",             "two",
    "3",           "three",
    "4",            "four",
    "5",            "five",
    "6",             "six",
    "7",           "seven",
    "8",           "eight",
    "9",            "nine",
    "aktg",              "ag",
    "aktiengesellschaft", "ag",
    "aktien gesellschaft", "ag",
    "aktien-gesellschaft", "ag",
    "associate",           "assoc",
    "associates",           "assoc",
    "berhad",             "bhd",
    "company",              "co",
    "compagnie",            "co",
    "corporation",            "corp",
    "designated activity company",             "dac",
    "development",             "dev",
    "eingetragene genossenschaft", "eg",
    "eingetragener kaufmann", "ek",
    "eingetragener verein", "ev",
    "einzelunternehmen", "eu",
    "finance",            "fine",
    "financial",            "fina",
    "financial",             "fin",
    "financing",            "fing",
    "generation",             "gen",
    "generation",             "gen",
    "gesellschaft buergerliches rechts", "gbr",
    "gesellschaft burgerliches rechts", "gbr",
    "gesellschaft mit beschrankter haftung", "gmbh",
    "gesellschaft mit beschr haftg", "gmbh",
    "gesellschaft m b h", "gmbh",
    "gesellschaft mbh", "gmbh",
    "ges mit beschrankter haftung", "gmbh",
    "ges mit beschr haftg", "gmbh",
    "g mit beschr haftg", "gmbh",
    "ges m b h", "gmbh",
    "ges mb h", "gmbh",
    "ges m bh", "gmbh",
    "ges mbh",            "gmbh",
    "gesmbh", "gmbh",
    "golden",             "gld",
    "government",            "govt",
    "groep",             "grp",
    "group",             "grp",
    "holding",           "hldgs",
    "holdings",           "hldgs",
    "incorporated",             "inc",
    "international",            "intl",
    "investment",          "invest",
    "investment",          "invest",
    "kommanditgesellschaft", "kg",
    "kommanditges", "kg",
    "komm ges", "kg",
    "kommanditgesellschaft auf aktien", "kgaa",
    "kommanditgesellschaft aa", "kgaa",
    "kommanditges auf aktien", "kgaa",
    "kommanditges aa", "kgaa",
    "kg auf aktien", "kgaa",
    "limited",             "ltd",
    "limited partnership",              "lp",
    "ltd liability co",             "llc",
    "offene handelsgesellschaft", "ohg",
    "ograniczona odpowiedzialnoscia",              "oo",
    "partner",             "prt",
    "partners",             "prt",
    "public co ltd",             "pcl",
    "public ltd co",             "plc",
    "resource",             "res",
    "resources",             "res",
    "san tic anonim sti", "santicanonimsti",
    "san tic ltd sti",    "santicltdsti",
    "sanayi",             "san",
    "sanayi ve ticaret",  "sanayi ticaret",
    "shipping",             "shp",
    "sirketi",             "sti",
    "societas europaea", "se",
    "sp z o o",           "spzoo",
    "sp z oo",           "spzoo",
    "spolka z ",           "sp z ",
    "ticaret",             "tic",
    "ug (haftungsbeschrankt)", "ug",
    "unternehmergesellschaft (haftungsbeschrankt)", "ug",
    "unternehmergesellschaft", "ug"
  )
  # styler: on
}

`%||%` <- function(x, y) {
  if (is.null(x)) {
    y
  } else {
    x
  }
}

replace_with_abbreviation <- function(replacement, .init) {
  replacement <- replacement %||% from_name_to_alias()
  replacement <- purrr::set_names(replacement, tolower)

  check_crucial_names(replacement, c("from", "to"))

  abbrev <- purrr::map2(tolower(replacement$from), tolower(replacement$to), c)
  purrr::reduce(abbrev, replace_abbrev, fixed = TRUE, .init = .init)
}

# replace long words with abbreviations
replace_abbrev <- function(text, abr, fixed = FALSE) {
  gsub(abr[1], abr[2], text, fixed = fixed)
}
#' Pipe operator
#'
#' See \code{magrittr::\link[magrittr:pipe]{\%>\%}} for details.
#'
#' @name %>%
#' @rdname pipe
#' @keywords internal
#' @export
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
#' @param lhs A value or the magrittr placeholder.
#' @param rhs A function call using the magrittr semantics.
#' @return The result of calling `rhs(lhs)`.
NULL
#' Help construct test data like the `loanbook` and `tilt` datasets
#'
#' @param id,company_name,country,postcode Minimum columns.
#' @inheritDotParams tibble::tibble
#'
#' @return Tibble
#' @examples
#' toy()
#' toy(id = 1:2)
#' toy(id = NULL)
#' toy(new = "xyz")
#' @noRd
toy <- function(id = 1, company_name = "a", country = "b", postcode = "c", ...) {
  tibble(
    id = id,
    company_name = company_name,
    country = country,
    postcode = postcode,
    ...
  )
}
#' Report duplicate rows
#'
#' Reports duplicates in `data` on columns `cols`. More specifically, we are
#' interested in this case on the `company_name`, `postcode` and `country`
#' columns. Duplicates are reported via a warning.
#'
#' @param data Tibble holding a result data set.
#' @param cols Vector of columns names on which we want to test if there are
#' duplicates on.
#'
#' @return NULL
#' @export
#' @keywords internal
report_duplicates <- function(data, cols) {
  duplicates <- data %>%
    dplyr::group_by(!!!rlang::syms(cols)) %>%
    dplyr::filter(dplyr::n() > 1) %>%
    dplyr::select(!!!rlang::syms(cols)) %>%
    dplyr::distinct_all()

  if (nrow(duplicates) > 0) {
    rlang::inform(
      c(
        paste0("Found duplicate(s) on columns ", paste(cols, collapse = ", "), " of the data set."),
        x = duplicates %>% glue::glue_data("Found for the company {company_name}, postcode: {postcode}, country: {country}"),
        i = "Please check if these duplicates are intended and have an unique id."
      )
    )
  }

  return(invisible())
}

#' Reports duplicates from manual matching outcome
#'
#' Function throws a descriptive error if a company from the loanbook is
#' matched to > 1 company in the tilt db or reverse.
#'
#'
#' @param manually_matched Tibble holding the result of the matching process,
#'   after the user has manually verified and matched the results
#'
#' @return Input `manually_matched`
#' @importFrom rlang .data
#' @export
check_duplicated_relation <- function(manually_matched) {
  suggested_matches <- manually_matched %>%
    dplyr::filter(.data$accept_match)

  duplicates_in_loanbook <- suggested_matches %>%
    dplyr::group_by(.data$id, .data$company_name) %>%
    dplyr::mutate(nrow = dplyr::n()) %>%
    dplyr::filter(nrow > 1)

  if (nrow(duplicates_in_loanbook) > 0) {
    duplicated_companies <- duplicates_in_loanbook %>%
      dplyr::distinct(.data$id, .data$company_name)

    rlang::abort(
      c(
        "Duplicated match of company in loanbook detected.",
        x = duplicated_companies %>% glue::glue_data("Duplicated company name: {company_name}, id: {id}."),
        i = c(
          "Company names where `accept_match` is `TRUE` must be unique by `id`.",
          "Have you ensured that only one tilt-id per loanbook-id is set to `TRUE`?"
        )
      )
    )
  }

  duplicates_in_tilt <- suggested_matches %>%
    dplyr::group_by(.data$id_tilt, .data$company_name_tilt) %>%
    dplyr::mutate(nrow = dplyr::n()) %>%
    dplyr::filter(nrow > 1)

  if (nrow(duplicates_in_tilt) > 0) {
    duplicated_companies <- duplicates_in_tilt %>%
      dplyr::distinct(.data$id_tilt)

    rlang::abort(
      c(
        "Duplicated match of company from tilt db detected.",
        x = duplicated_companies %>% glue::glue_data("Duplicated tilt company name: {company_name_tilt}, tilt id: {id_tilt}."),
        i = c(
          "Have you ensured that each tilt-id is set to `TRUE` for maximum 1 company from the loanbook?"
        )
      )
    )
  }

  rlang::inform(message = "No duplicated matches found in the data.")

  return(invisible(manually_matched))
}

#' Render a .Rmd file into a .md file under tests/testthat/demos
#'
#' One use case of this  function is when you are working on a PR and want to
#' share the output of an .Rmd file. Once done would delete the .md file and
#' merge the PR.
#' @noRd
#' @examples
#' render_demo("vignettes/articles/tilt-company-match.Rmd")
render_demo <- function(path) {
  md <- path |>
    fs::path_file() |>
    fs::path_ext_remove() |>
    fs::path_ext_set(".md")

  parent <- fs::dir_create(here::here(testthat::test_path("demos")))
  rmarkdown::render(
    path,
    "md_document",
    output_file = fs::path(parent, md)
  )

  invisible(path)
}