use std::path::PathBuf;

use anyhow::{Context, Ok, Result};
use quote::quote;

/// Locations, relative to the project root, that this generator reads or writes.
mod paths {
    /// Local copy of `DerivedCoreProperties.txt`, reused on later runs instead of downloading it again.
    pub const DERIVED_CORE_PROPERTIES: &str = "target/DerivedCoreProperties.txt";
    /// Destination of the generated Rust lookup tables.
    pub const TABLES: &str = "crates/biome_unicode_table/src/tables.rs";
}

/// Regenerate the Unicode lookup tables file (`paths::TABLES`).
///
/// For each of the `ID_Continue` and `ID_Start` derived properties, this emits a sorted
/// `(char, char)` range table together with a predicate function that binary-searches it.
/// The generated tokens are then formatted and written to disk.
/// # Errors
/// Return an error if the property data can't be loaded or parsed, or if formatting or
/// writing the generated file fails.
pub fn generate_tables() -> Result<()> {
    let unicode_data = Properties::cached_or_fetch()?;

    let mut items = Vec::new();
    for property in ["ID_Continue", "ID_Start"] {
        // One `(start, end)` tuple literal per collapsed range of the property.
        let mut ranges = Vec::new();
        for (start, end) in unicode_data.extract(property)? {
            ranges.push(quote! { (#start, #end) });
        }

        // The predicate function is named after the property itself, its backing
        // table gets a `_table` suffix.
        let fn_ident = quote::format_ident!("{}", property);
        let table_ident = quote::format_ident!("{}_table", property);

        items.push(quote! {
            pub const #table_ident: &[(char, char)] = &[
                #(#ranges),*
            ];

            pub fn #fn_ident(c: char) -> bool { super::bsearch_range_table(c, #table_ident) }
        });
    }

    let tokens = quote! {
        //! Autogenerated file, do not edit by hand.
        //! Run `cargo codegen unicode` and recommit this file when Unicode support has changed.
        #![allow(missing_docs, non_upper_case_globals, non_snake_case)]

        fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
            use core::cmp::Ordering::{Equal, Less, Greater};
            r.binary_search_by(|&(lo,hi)| {
                // Because ASCII ranges are at the start of the tables, a search for an
                // ASCII char will involve more `Greater` results (i.e. the `(lo,hi)`
                // table entry is greater than `c`) than `Less` results. And given that
                // ASCII chars are so common, it makes sense to favor them. Therefore,
                // the `Greater` case is tested for before the `Less` case.
                if lo > c { Greater }
                else if hi < c { Less }
                else { Equal }
            }).is_ok()
        }

        pub mod derived_property {
            #(#items)*
        }
    };

    let formatted = xtask_glue::reformat(tokens)?;
    let destination = xtask_glue::project_root().join(paths::TABLES);
    std::fs::write(destination, formatted)?;

    Ok(())
}

/// In-memory copy of the Unicode `DerivedCoreProperties.txt` data file.
struct Properties {
    /// Unparsed text of the file, as read from the cache or downloaded.
    raw: String,
}

impl Properties {
    /// Load the properties from the on-disk cache, downloading them when the cache can't be read.
    ///
    /// Any failure to read the cache (not only a missing file) triggers a download, and the
    /// error from the cache read is discarded. A freshly downloaded copy is saved to the
    /// cache before being returned.
    /// # Errors
    /// Return an error if the cache can't be read and either the download or the subsequent
    /// cache write fails.
    pub fn cached_or_fetch() -> Result<Self> {
        match Self::from_cache() {
            Err(_) => {
                let downloaded = Self::fetch()?;
                downloaded.save_cache()?;
                Ok(downloaded)
            }
            // Only successful cache reads reach this arm; hand them back untouched.
            cached => cached,
        }
    }

    /// Location of the cached data file: `paths::DERIVED_CORE_PROPERTIES` resolved against
    /// the project root.
    fn path() -> PathBuf {
        let root = xtask_glue::project_root();
        root.join(paths::DERIVED_CORE_PROPERTIES)
    }

    /// Retrieve properties from the unicode website.
    ///
    /// The file is requested over HTTPS: its content is turned into generated source code
    /// that gets committed, so it must not be readable or modifiable in transit.
    /// # Errors
    /// Return an error if the HTTP request fails or the response body can't be read as text.
    fn fetch() -> Result<Self> {
        const URL: &str = "https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt";

        let raw = ureq::get(URL)
            .call()
            .with_context(|| format!("unable to request `{}`", URL))?
            .into_body()
            .read_to_string()
            .with_context(|| format!("unable to read the response body of `{}`", URL))?;

        println!("Loaded properties from `unicode.org`");

        Ok(Self { raw })
    }

    /// Read the properties back from the file written by [`Self::save_cache`].
    /// # Errors
    /// Return an error if the cache file couldn't be read.
    fn from_cache() -> Result<Self> {
        let cache_file = Self::path();
        let cached = Self {
            raw: std::fs::read_to_string(&cache_file)?,
        };

        println!("Loaded properties from cache ({})", cache_file.display());

        Ok(cached)
    }

    /// Save the properties on disk for later usage, see [`Self::from_cache`].
    /// # Errors
    /// Return an error if saving to disk fails.
    fn save_cache(&self) -> Result<()> {
        let path = Self::path();
        std::fs::write(&path, &self.raw)?;

        println!("Saved properties to cache ({})", path.display());

        Ok(())
    }

    /// Extract code points matching the specified `property`.
    ///
    /// Everything after a `#` on a line is ignored. The first `;`-separated field is a
    /// code point (`0041`) or an inclusive range (`0041..005A`) in hexadecimal; a line is
    /// kept when any of its remaining fields equals `property` once trimmed.
    ///
    /// The returned `Vec` is sorted and sequential code points are collapsed.
    /// # Errors
    /// Return an error if parsing fails, if a range has its start after its end, or if
    /// the matching entries are not in strictly ascending order (the generated table is
    /// binary-searched, so it must be sorted and non-overlapping).
    pub fn extract(&self, property: &str) -> Result<Vec<(char, char)>> {
        /// Parse a hexadecimal code point and make sure it is a valid `char`.
        fn parse_code_point(input: &str) -> Result<char> {
            let value = u32::from_str_radix(input, 16)
                .with_context(|| format!("unable to parse `{}` as a code point", input))?;

            char::from_u32(value).ok_or_else(|| anyhow::anyhow!("invalid char `{}`", input))
        }

        self.raw
            .lines()
            .filter_map(|line| {
                // Discard comments.
                let line = line.split('#').next()?;

                // https://www.unicode.org/reports/tr44/#Data_Fields
                let mut fields = line.split(';');

                // First field is a code point or range.
                let code_point_or_range = fields.next()?.trim();

                // Check if the remaining fields contains the property we are looking for.
                if fields.any(|field| field.trim() == property) {
                    Some(code_point_or_range)
                } else {
                    None
                }
            })
            .try_fold(
                Vec::<(char, char)>::new(),
                |mut buffer, code_point_or_range| {
                    let (start, end) = match code_point_or_range.split_once("..") {
                        Some((start, end)) => (parse_code_point(start)?, parse_code_point(end)?),
                        None => {
                            let code_point = parse_code_point(code_point_or_range)?;
                            (code_point, code_point)
                        }
                    };

                    // A reversed range can never match and would break the ordering
                    // the binary search relies on.
                    anyhow::ensure!(
                        start <= end,
                        "invalid range `{}`: start is greater than end",
                        code_point_or_range
                    );

                    if let Some((_, previous_end)) = buffer.last_mut() {
                        // Report malformed data as an error instead of panicking.
                        anyhow::ensure!(
                            *previous_end < start,
                            "need sorted table for binary search, but `{}` does not come after U+{:04X}",
                            code_point_or_range,
                            *previous_end as u32
                        );

                        // If the ranges are continuous, collapse them.
                        if (*previous_end as u32) + 1 == (start as u32) {
                            *previous_end = end;

                            return Ok(buffer);
                        }
                    }

                    buffer.push((start, end));

                    Ok(buffer)
                },
            )
    }
}
