# This script can be used to derive company-specific sector shares for companies
# that are active in two or more of the in-scope PACTA sectors. There are a
# number of ways how to calculate sector splits. One option is to calculate
# sector splits based on an equal weights approach, with an option
# to use primary energy inputs to calculate the shares for PACTA energy sectors:
# Coal, Oil & Gas, Power. Another option is to allocate the split entirely to
# the worst performing sector, based on the aggregate alignment metric on the
# company-sector level.
# The initial sector split is calculated as an equal weights split based on the
# number of in-scope sectors the company operates in. The energy-focused second
# step calculates energy sector splits based on a common unit of economic
# activity, million tons of oil equivalent (mtoe). A number of transformation
# steps must be made to arrive at this common unit for all three energy sectors.
# For the Oil & Gas and the Coal sectors, the transformation is rather straight
# forward, as both are already given in terms of primary energy. This means a
# simple conversion factor can be applied.
# For Power Generation, the steps are more complicated. Power generation is
# given in terms of MWh of electricity generated. This needs to be converted to
# mtoe using a conversion factor as in the other cases. Additionally, we need to
# consider that the process of generating electricity from a primary energy
# input leads to losses of primary energy in terms of heat in some technologies.
# The amount of primary energy lost to heat differs by technology and is
# generally relevant in power generation through heat or combustion, but not all
# other ways of generating electricity.
# We therefore have to divide the electricity generated by an efficiency factor
# specific to each technology to arrive at a primary energy equivalent for power
# generation.
# The output is a list of companies and sectors the companies operate in, for
# which a sector share ratio is provided, based on the two steps outlined above.
# The intended use case is to provide a rule by which to allocate loans to
# multiple sectors, for loan books where such loan allocation is ambiguous.

prepare_sector_split <- function(config) {
  config <- load_config(config)

  if (!get_apply_sector_split(config)) {
    cli::cli_warn("{.arg apply_sector_split} is {.val FALSE} in your config, so the {.fn prepare_sector_split} function will do nothing.")
    return(invisible())
  }

  # input/output paths for prepare_sector_split
  dir_prepared_abcd <- get_dir_prepared_abcd(config)
  path_sector_split <- get_path_sector_split(config)
  path_advanced_company_indicators <- get_path_advanced_company_indicators(config)
  sheet_advanced_company_indicators <- get_sheet_advanced_company_indicators(config)

  start_year <- get_start_year(config)
  time_frame <- get_time_frame(config)
  remove_inactive_companies <- get_remove_inactive_companies(config)

  ## load input data----
  advanced_company_indicators_raw <- readxl::read_xlsx(
    path = path_advanced_company_indicators,
    sheet = sheet_advanced_company_indicators
  )

  company_ids_primary_energy_split <- readr::read_csv(
    path_sector_split,
    col_types = readr::cols_only(company_id = "d"),
    col_select = "company_id"
  ) %>%
    dplyr::pull(.data[["company_id"]])

  # optional: remove inactive companies
  if (remove_inactive_companies) {
    abcd_removed_inactive_companies <- readr::read_csv(
      file.path(dir_prepared_abcd, "abcd_removed_inactive_companies.csv"),
      col_select = dplyr::all_of(cols_abcd),
      col_types = col_types_abcd_final
    )
  }

  # validate input data----
  validate_input_sector_split(
    data = advanced_company_indicators_raw,
    start_year = start_year
  )

  # calculate sector split----
  ## wrangle input data----
  advanced_company_indicators <- advanced_company_indicators_raw %>%
    dplyr::rename_with(.fn = ~ gsub(" ", "_", tolower(.x))) %>%
    # to compare primary energy units, we need power generation, not power capacity
    dplyr::filter(
      (.data[["asset_sector"]] == "Power" & .data[["activity_unit"]] == "MWh") | .data[["asset_sector"]] != "Power"
    ) %>%
    dplyr::select(
      -dplyr::all_of(
        c(
          dplyr::starts_with("direct_ownership_"),
          dplyr::starts_with("financial_control_")
        )
      )
    ) %>%
    dplyr::rename_with(.fn = ~ gsub("asset_", "", .x)) %>%
    tidyr::pivot_longer(
      cols = dplyr::starts_with("equity_ownership_"),
      names_to = "year",
      names_prefix = "equity_ownership_",
      values_to = "value",
      values_ptypes = list("value" = numeric())
    ) %>%
    dplyr::mutate(year = as.numeric(.data[["year"]])) %>%
    dplyr::mutate(
      sector = tolower(.data[["sector"]]),
      sector = dplyr::case_when(
        .data[["sector"]] == "oil&gas" ~ "oil and gas",
        .data[["sector"]] == "ldv" ~ "automotive",
        TRUE ~ .data[["sector"]]
      ),
      technology = dplyr::case_when(
        .data[["sector"]] == "coal" ~ "coal",
        .data[["sector"]] == "oil and gas" & grepl("Gas", .data[["technology"]]) ~ "gas",
        .data[["sector"]] == "oil and gas" & grepl("Oil", .data[["technology"]]) ~ "oil",
        .data[["sector"]] == "power" ~ tolower(.data[["technology"]]),
        TRUE ~ tolower(.data[["technology"]])
      )
    ) %>%
    dplyr::filter(
      !.data[["sector"]] %in% c("hdv", "shipping"),
      !.data[["activity_unit"]] == "tkm"
    ) %>%
    dplyr::summarise(
      value = sum(.data[["value"]], na.rm = TRUE),
      .by = c(
        "company_id",
        "company_name",
        "sector",
        "technology",
        "activity_unit",
        "year"
      )
    ) %>%
    dplyr::rename(
      name_company = "company_name",
      production = "value",
      production_unit = "activity_unit"
    ) %>%
    # we calculate the sector split based on the primary energy mix of the start year
    dplyr::filter(.data[["year"]] == .env[["start_year"]])

  # optional: remove inactive companies
  if (remove_inactive_companies) {
    advanced_company_indicators <- advanced_company_indicators %>%
      dplyr::anti_join(abcd_removed_inactive_companies, by = c("company_id", "sector"))
  }

  ## determine sector splits by company----
  ### count number of sectors and energy sectors per company----
  n_sectors_by_company <- advanced_company_indicators %>%
    dplyr::mutate(
      energy_sector = dplyr::if_else(
        .data[["sector"]] %in% c("coal", "oil and gas", "power"), TRUE, FALSE
      )
    ) %>%
    dplyr::distinct(
      .data[["company_id"]],
      .data[["sector"]],
      .data[["energy_sector"]]
    ) %>%
    dplyr::mutate(
      n_sectors = dplyr::n(),
      .by = "company_id"
    ) %>%
    dplyr::summarise(
      n_energy_sectors = sum(.data[["energy_sector"]], na.rm = TRUE),
      .by = c("company_id", "n_sectors")
    )

  ### identify companies active in more than one energy sector----
  companies_in_multiple_energy_sectors <- n_sectors_by_company %>%
    dplyr::filter(.data[["n_energy_sectors"]] > 1) %>%
    dplyr::pull(.data[["company_id"]])

  ## calculate equal weights sector split for all sectors----
  # for each company add sector split by number of sectors the company operates in equally
  sector_split_all_companies <- advanced_company_indicators %>%
    dplyr::filter(
      .data[["year"]] == .env[["start_year"]]
    ) %>%
    dplyr::inner_join(
      n_sectors_by_company,
      by = "company_id"
    ) %>%
    dplyr::mutate(
      sector_split = 1 / .data[["n_sectors"]]
    ) %>%
    dplyr::summarise(
      production = sum(.data[["production"]], na.rm = TRUE),
      n_sectors = max(.data[["n_sectors"]], na.rm = TRUE),
      n_energy_sectors = max(.data[["n_energy_sectors"]], na.rm = TRUE),
      sector_split = max(.data[["sector_split"]], na.rm = TRUE),
      .by = c("company_id", "name_company", "sector", "year", "production_unit")
    )

  ### check that the sum of the sector split of each company is 1----
  assert_sector_split_is_one(sector_split_all_companies)

  ## calculate primary energy-based sector split for energy sectors----
  # keep only companies that are active in multiple energy sectors
  sector_split_multi_energy_companies <- advanced_company_indicators %>%
    dplyr::filter(
      .data[["company_id"]] %in% .env[["companies_in_multiple_energy_sectors"]],
      .data[["sector"]] %in% c("coal", "oil and gas", "power"),
      .data[["year"]] == .env[["start_year"]]
    )

  # adjust power generation by primary energy efficiency
  sector_split_multi_energy_companies_power <- sector_split_multi_energy_companies %>%
    dplyr::filter(.data[["sector"]] == "power") %>%
    dplyr::inner_join(
      primary_energy_efficiency,
      by = c("sector", "technology")
    ) %>%
    dplyr::mutate(
      production = .data[["production"]] / .data[["primary_energy_efficiency_factor"]]
    ) %>%
    dplyr::select(-"primary_energy_efficiency_factor")

  # transform all energy sectors to common unit of energy: mtoe
  sector_split_multi_energy_companies <- sector_split_multi_energy_companies %>%
    dplyr::filter(.data[["sector"]] != "power") %>%
    dplyr::bind_rows(sector_split_multi_energy_companies_power) %>%
    dplyr::summarise(
      production = sum(.data[["production"]], na.rm = TRUE),
      .by = c("company_id", "name_company", "sector", "year", "production_unit")
    ) %>%
    dplyr::inner_join(
      unit_conversion,
      by = c("sector", "production_unit" = "unit")
    ) %>%
    dplyr::mutate(
      production = .data[["production"]] * .data[["value_in_mtoe"]],
      production_unit = "mtoe"
    ) %>%
    dplyr::select(-"value_in_mtoe")

  # get the sector split for each multi energy sector company based on common energy units
  sector_split_multi_energy_companies <- sector_split_multi_energy_companies %>%
    dplyr::mutate(
      sector_split = .data[["production"]] / sum(.data[["production"]], na.rm = TRUE),
      .by = c(
        "company_id",
        "name_company",
        "year",
        "production_unit"
      )
    )

  # wrangle
  sector_split_multi_energy_companies <- sector_split_multi_energy_companies %>%
    dplyr::select(
      dplyr::all_of(
        c(
          "company_id",
          "name_company",
          "sector",
          "production_unit",
          "production",
          "sector_split"
        )
      )
    )

  # keep only companies that are provided in input company list
  sector_split_multi_energy_companies <- sector_split_multi_energy_companies %>%
    dplyr::filter(.data[["company_id"]] %in% .env[["company_ids_primary_energy_split"]])

  ### check that the sum of the primary energy based sector split of each company is 1----
  assert_sector_split_is_one(sector_split_multi_energy_companies)

  ## combine the sector splits----
  # we want to use the plain equal weights split for companies that do not operate in more than one energy sector
  # for companies that operate in more than one energy sector, we want to scale the primary energy based split to the equal weights share of these sectors in the total company operations
  # this means that if a multi energy sector company only operates in energy sectors it will retain the primary energy based sector split
  # if a company operates in multiple energy sectors and non-energy sectors, we want to scale the primary energy based split to the equal weights share of the energy sectors to ensure the exosure to non-energy sectors is not lost
  sector_split_all_companies_final <- sector_split_all_companies %>%
    dplyr::left_join(
      sector_split_multi_energy_companies,
      by = c("company_id", "name_company", "sector"),
      suffix = c("_all", "_energy")
    ) %>%
    dplyr::mutate(
      sector_split_energy_scaled = (.data[["n_energy_sectors"]] / .data[["n_sectors"]]) * .data[["sector_split_energy"]],
      sector_split = dplyr::if_else(
        is.na(.data[["sector_split_energy"]]),
        .data[["sector_split_all"]],
        .data[["sector_split_energy_scaled"]]
      )
    ) %>%
    dplyr::rename(
      production = "production_all",
      production_unit = "production_unit_all"
    )

  ### check that the sum of the combined sector split of each company is 1----
  assert_sector_split_is_one(sector_split_all_companies_final)

  ## write output----
  sector_split_multi_energy_companies %>%
    dplyr::select(
      dplyr::all_of(
        c(
          "company_id",
          "name_company",
          "sector",
          "sector_split"
        )
      )
    ) %>%
    readr::write_csv(
      file.path(dir_prepared_abcd, "companies_sector_split_primary_energy_only.csv"),
      na = ""
    )

  sector_split_all_companies %>%
    dplyr::select(
      dplyr::all_of(
        c(
          "company_id",
          "name_company",
          "sector",
          "sector_split"
        )
      )
    ) %>%
    readr::write_csv(
      file.path(dir_prepared_abcd, "companies_sector_split_equal_weights_only.csv"),
      na = ""
    )

  sector_split_all_companies_final %>%
    dplyr::select(
      dplyr::all_of(
        c(
          "company_id",
          "name_company",
          "sector",
          "sector_split"
        )
      )
    ) %>%
    readr::write_csv(
      file.path(dir_prepared_abcd, "companies_sector_split.csv"),
      na = ""
    )
}

validate_input_sector_split <- function(data,
                                        start_year) {
  # consistency check
  available_years <- dplyr::select(data, dplyr::starts_with("Equity Ownership"))
  available_years <- unique(as.numeric(gsub("Equity Ownership ", "", names(available_years))))

  if (!start_year %in% available_years) {
    cli::cli_abort(
      message = c(
        x = "required {.arg start_year} for sector split not found in {.arg data}",
        i = "You provided: {.arg start_year} = {start_year}",
        i = "Available values are: {available_years}",
        i = "Please ensure that your input data sets and parameter settings are consistent."
      )
    )
  }

  invisible()
}
