#!/usr/bin/python3

"""
GIP provider for htcondor-CE.
"""

# The script's file name cannot be changed, so silence pylint's naming check.
# pylint: disable=invalid-name


import re
import sys
from datetime import datetime
import subprocess
from collections import defaultdict, namedtuple
import signal
import htcondor2 as htcondor
import classad2 as ca

# LDIF template for the GLUE2 ComputingService entry.
# Placeholders: central_manager, num_endpoints, num_shares, site_name.
SERVICE_LDIF = """dn: GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
GLUE2ServiceID: {central_manager}
objectClass: GLUE2Entity
objectClass: GLUE2Service
objectClass: GLUE2ComputingService
GLUE2EntityName: Computing Service {central_manager}
GLUE2ServiceCapability: executionmanagement.jobexecution
GLUE2ServiceType: org.opensciencegrid.htcondorce
GLUE2ServiceQualityLevel: production
GLUE2ServiceComplexity: endpointType={num_endpoints}, share={num_shares}, resource=1
GLUE2ServiceAdminDomainForeignKey: {site_name}
"""


# LDIF template for the GLUE2 ComputingManager entry.
# Placeholders: central_manager, version, total_cores.
MANAGER_LDIF = """dn: GLUE2ManagerID={central_manager}_Manager,GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Manager
objectClass: GLUE2ComputingManager
GLUE2ManagerID: {central_manager}_Manager
GLUE2ManagerProductName: HTCondor
GLUE2ManagerProductVersion: {version}
GLUE2ComputingManagerTotalLogicalCPUs: {total_cores}
GLUE2ManagerServiceForeignKey: {central_manager}
GLUE2ComputingManagerComputingServiceForeignKey: {central_manager}
"""


# LDIF template for one GLUE2 ExecutionEnvironment (resource) entry.
# Placeholders: central_manager, arch, os, name, version, cpu, memory,
# instances.  The resource ID is built from the same fields, in the same
# order, as format_resource_name() joins them.
RESOURCE_LDIF = """dn: GLUE2ResourceID={central_manager}_{arch}_{os}_{name}_{version}_{cpu}_{memory},GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Resource
objectClass: GLUE2ExecutionEnvironment
GLUE2ResourceID: {central_manager}_{arch}_{os}_{name}_{version}_{cpu}_{memory}
GLUE2ExecutionEnvironmentMainMemorySize: {memory}
GLUE2ExecutionEnvironmentVirtualMemorySize: {memory}
GLUE2ExecutionEnvironmentOSFamily: {os}
GLUE2ExecutionEnvironmentOSName: {name}
GLUE2ExecutionEnvironmentOSVersion: {version}
GLUE2ExecutionEnvironmentCPUMultiplicity: singlecpu-multicore
GLUE2ExecutionEnvironmentPlatform: {arch}
GLUE2ExecutionEnvironmentLogicalCPUs: {cpu}
GLUE2ExecutionEnvironmentConnectivityIn: TRUE
GLUE2ExecutionEnvironmentConnectivityOut: TRUE
GLUE2ExecutionEnvironmentTotalInstances: {instances}
GLUE2ResourceManagerForeignKey:  {central_manager}_Manager
GLUE2ExecutionEnvironmentComputingManagerForeignKey:  {central_manager}_Manager
"""


# LDIF template for one GLUE2 ComputingEndpoint entry (one per CE schedd).
# Placeholders: name, central_manager, version, state, state_info,
# start_time, issuer.  Note the endpoint URL hard-codes port 9619.
ENDPOINT_LDIF = """dn: GLUE2EndpointID={name}_HTCondorCE,GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Endpoint
objectClass: GLUE2ComputingEndpoint
GLUE2EndpointID: {name}_HTCondorCE
GLUE2EndpointCapability: executionmanagement.jobexecution
GLUE2EndpointInterfaceName: org.opensciencegrid.htcondorce
GLUE2EndpointImplementor: HTCondor
GLUE2EndpointImplementationName: HTCondor
GLUE2EndpointImplementationVersion: {version}
GLUE2EndpointURL: condor://{name}:9619
GLUE2EndpointQualityLevel: production
GLUE2EndpointServingState: production
GLUE2EndpointHealthState: {state}
GLUE2EndpointHealthStateInfo: {state_info}
GLUE2EndpointStartTime: {start_time}
GLUE2EndpointIssuerCA: {issuer}
GLUE2EndpointDowntimeInfo: See the GOC DB for downtimes: https://goc.egi.eu/
GLUE2EndpointServiceForeignKey: {central_manager}
GLUE2ComputingEndpointComputingServiceForeignKey: {central_manager}
"""

# LDIF template for one GLUE2 Benchmark entry attached to a resource.
# Placeholders: central_manager, resource, spec_type, spec_value.
SPEC_LDIF = """dn: GLUE2BenchmarkID={central_manager}_{spec_type},GLUE2ResourceID={central_manager}_{resource},GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
GLUE2BenchmarkExecutionEnvironmentForeignKey: {central_manager}_{resource}
GLUE2BenchmarkID: {central_manager}_{spec_type}
GLUE2BenchmarkType: {spec_type}
objectClass: GLUE2Entity
objectClass: GLUE2Benchmark
GLUE2BenchmarkValue: {spec_value}
GLUE2BenchmarkComputingManagerForeignKey: {central_manager}_Manager
GLUE2EntityName: Benchmark {spec_type}
"""

# LDIF template for one GLUE2 ComputingShare entry (one per CE and VO).
# Placeholders: ce_fqdn, voname, central_manager, total_vo_jobs,
# idle_vo_jobs, running_vo_jobs, resource_keys.  resource_keys is a
# pre-rendered run of RESOURCE_KEY lines, one pair per resource.
SHARE_LDIF = """dn: GLUE2ShareID={ce_fqdn}_{voname}_share,GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Share
objectClass: GLUE2ComputingShare
GLUE2ShareID: {ce_fqdn}_{voname}_share
GLUE2ComputingShareServingState: production
GLUE2ComputingShareTotalJobs: {total_vo_jobs}
GLUE2ComputingShareWaitingJobs: {idle_vo_jobs}
GLUE2ComputingShareRunningJobs: {running_vo_jobs}
GLUE2ComputingShareComputingServiceForeignKey: {central_manager}
GLUE2ComputingShareComputingEndpointForeignKey: {ce_fqdn}_HTCondorCE
GLUE2ShareServiceForeignKey: {central_manager}
GLUE2ShareEndpointForeignKey: {ce_fqdn}_HTCondorCE
{resource_keys}
"""

# Pair of foreign-key lines linking a share to one resource; rendered once
# per resource and substituted into SHARE_LDIF's {resource_keys}.
# Placeholders: central_manager, resource.
RESOURCE_KEY = """GLUE2ComputingShareExecutionEnvironmentForeignKey: {central_manager}_{resource}
GLUE2ShareResourceForeignKey: {central_manager}_{resource}
"""

# LDIF template holding two entries for a (CE, VO) pair: a GLUE2MappingPolicy
# tied to the share and a GLUE2AccessPolicy tied to the endpoint.
# Placeholders: ce_fqdn, voname, central_manager.
POLICY_LDIF = """dn: GLUE2PolicyID={ce_fqdn}_{voname}_policy,GLUE2ShareID={ce_fqdn}_{voname}_share,GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Policy
objectClass: GLUE2MappingPolicy
GLUE2PolicyID: {ce_fqdn}_{voname}_policy
GLUE2PolicyScheme: org.glite.standard
GLUE2PolicyRule: vo:{voname}
GLUE2MappingPolicyShareForeignKey: {ce_fqdn}_{voname}_share

dn: GLUE2PolicyID={ce_fqdn}_{voname}_policy_access,GLUE2ShareID={ce_fqdn}_{voname}_share,GLUE2ServiceID={central_manager},GLUE2GroupID=resource,o=glue
objectClass: GLUE2Entity
objectClass: GLUE2Policy
objectClass: GLUE2AccessPolicy
GLUE2PolicyID: {ce_fqdn}_{voname}_policy_access
GLUE2PolicyScheme: org.glite.standard
GLUE2PolicyRule: vo:{voname}
GLUE2AccessPolicyEndpointForeignKey: {ce_fqdn}_HTCondorCE
"""


class TimeoutError(Exception):
    """
    Exception signalling that an operation exceeded its time limit.

    Derives directly from Exception; within this module the name shadows
    the builtin TimeoutError.
    """


# pylint: disable=unused-argument
def handler(signum, frame):
    """
    Signal handler that turns a delivered signal into a TimeoutError.

    Both arguments are accepted to match the signal-handler call
    signature and are otherwise ignored.
    """
    message = "TimeoutError"
    raise TimeoutError(message)

def format_resource_name(machine_ad):
    """
    Build the underscore-separated resource name for a machine description.

    The name joins, in order, the Arch, OpSys, OpSysName, OpSysMajorVer,
    DetectedCpus and DetectedMemory attributes of machine_ad; every one of
    them must already be a string.
    """
    name_fields = ('Arch', 'OpSys', 'OpSysName', 'OpSysMajorVer',
                   'DetectedCpus', 'DetectedMemory')
    return '_'.join(getattr(machine_ad, field) for field in name_fields)


def format_resource_entry(central_manager, machine_ad, instances):
    """
    Render the RESOURCE_LDIF template for one machine description.

    machine_ad supplies the Arch, OpSys, OpSysName, OpSysMajorVer,
    DetectedMemory and DetectedCpus attributes; instances is the value
    written as the number of instances of this resource.
    """
    values = {
        'central_manager': central_manager,
        'arch': machine_ad.Arch,
        'os': machine_ad.OpSys,
        'name': machine_ad.OpSysName,
        'version': machine_ad.OpSysMajorVer,
        'memory': machine_ad.DetectedMemory,
        'cpu': machine_ad.DetectedCpus,
        'instances': instances,
    }
    return RESOURCE_LDIF.format(**values)

def format_spec_entry(central_manager, machine_ad, specs):
    """
    Render one SPEC_LDIF entry per benchmark in specs for a resource.

    specs maps benchmark type to value.  Each type is stripped of
    surrounding whitespace and has underscores replaced by dashes before
    being written.  The rendered entries are joined with newlines; an
    empty specs mapping yields an empty string.
    """
    resource_id = format_resource_name(machine_ad)

    entries = [
        SPEC_LDIF.format(central_manager=central_manager,
                         resource=resource_id,
                         spec_type=raw_type.strip().replace('_', '-'),
                         spec_value=value)
        for raw_type, value in specs.items()
    ]

    return '\n'.join(entries)


def format_manager_entry(central_manager, version, total_cores):
    """
    Render the MANAGER_LDIF template for the batch system manager.
    """
    values = {
        'central_manager': central_manager,
        'version': version,
        'total_cores': total_cores,
    }
    return MANAGER_LDIF.format(**values)


def format_service_entry(central_manager, endpoints, vonames, site_name):
    """
    Render the SERVICE_LDIF template for the GLUE2 Service.

    Only the lengths of endpoints and vonames are used: they become the
    endpoint and share counts in the service complexity line.
    """
    endpoint_count = len(endpoints)
    share_count = len(vonames)
    return SERVICE_LDIF.format(site_name=site_name,
                               num_shares=share_count,
                               num_endpoints=endpoint_count,
                               central_manager=central_manager)


def format_endpoint_entry(central_manager, ce_schedd_ad, time_out):
    """
    Render the ENDPOINT_LDIF template for one CE schedd.

    Reads 'Name', 'CondorVersion' and 'DaemonStartTime' from ce_schedd_ad,
    determines the health state via find_ce_state() (bounded by time_out
    seconds) and reads the issuer of /etc/grid-security/hostcert.pem.

    Raises TimeoutError if the health check times out.
    """
    # Imported locally so this function does not depend on a change to the
    # module-level import list.
    from datetime import timezone

    name = ce_schedd_ad['Name']
    # The second whitespace-separated token of CondorVersion is used as the
    # version string.
    version = ce_schedd_ad['CondorVersion'].split()[1]
    state, state_info = find_ce_state(ce_schedd_ad, time_out)
    issuer = get_cert_issuer('/etc/grid-security/hostcert.pem')

    # The timestamp is published with a literal 'Z' (UTC) suffix, so the
    # epoch value must be converted in UTC rather than in local time.
    started = datetime.fromtimestamp(int(ce_schedd_ad['DaemonStartTime']),
                                     tz=timezone.utc)
    start_time = started.strftime('%Y-%m-%dT%H:%M:%SZ')

    return ENDPOINT_LDIF.format(name=name,
                                version=version,
                                state=state,
                                state_info=state_info,
                                start_time=start_time,
                                issuer=issuer,
                                central_manager=central_manager)


def format_share_entry(central_manager, ce_fqdn, voname, topologies, total_jobs, idle_jobs, running_jobs):
    """
    Render the SHARE_LDIF template for one VO on one CE.

    topologies is iterated for machine descriptions; each contributes one
    RESOURCE_KEY pair of foreign-key lines to the share entry.
    """
    resource_keys = "".join(
        RESOURCE_KEY.format(central_manager=central_manager,
                            resource=format_resource_name(machine_ad))
        for machine_ad in topologies
    )

    values = {
        'ce_fqdn': ce_fqdn,
        'voname': voname,
        'total_vo_jobs': total_jobs,
        'idle_vo_jobs': idle_jobs,
        'running_vo_jobs': running_jobs,
        'central_manager': central_manager,
        'resource_keys': resource_keys,
    }
    return SHARE_LDIF.format(**values)


def format_policy_entry(central_manager, ce_fqdn, voname):
    """
    Render the POLICY_LDIF template (mapping and access policy) for one
    VO on one CE.
    """
    values = {
        'central_manager': central_manager,
        'ce_fqdn': ce_fqdn,
        'voname': voname,
    }
    return POLICY_LDIF.format(**values)


def get_ce_schedd_ad(fqdn, port=9619):
    """
    Return the schedd ad named fqdn from the CE collector at fqdn:port.

    Only the first matching ad is returned; an empty query result raises
    IndexError.
    """
    # Contact the CE collector on the given (by default, the CE) port.
    collector = htcondor.Collector('{0}:{1}'.format(fqdn, port))
    constraint = 'Name =?= "{0}"'.format(fqdn)
    matches = collector.query(htcondor.AdTypes.Schedd, constraint)
    return matches[0]


def find_ce_state(ce_schedd_ad, time_out):
    """
    Determine the health state of a CE schedd via a security ping.

    Returns a (state, state_info) tuple:
      * ('ok', ...) when the READ ping reports AuthorizationSucceeded,
      * ('warning', ...) when the ping answers but authorization did not
        succeed,
      * ('critical', ...) when the ping raises KeyError or RuntimeError.

    The ping is bounded by a SIGALRM alarm of time_out seconds; when it
    fires, a message is written to stderr and TimeoutError is re-raised.
    The alarm is always cancelled and the previous SIGALRM handler
    restored before returning or raising.
    """
    previous_handler = signal.signal(signal.SIGALRM, handler)
    if previous_handler is None:
        # The previous handler was not installed from Python and cannot be
        # re-installed; fall back to the default disposition.
        previous_handler = signal.SIG_DFL
    signal.alarm(time_out)
    try:
        if htcondor.SecMan().ping(ce_schedd_ad, "READ")['AuthorizationSucceeded']:
            state = 'ok'
        else:
            state = 'warning'
        # NOTE(review): the same info text is published for 'ok' and
        # 'warning'; kept as-is because consumers may match on it.
        state_info = 'Authorization ping successful'
    except (KeyError, RuntimeError):
        state = 'critical'
        state_info = 'Authorization ping failed'
    except TimeoutError:
        sys.stderr.write("Ping to CE schedd on %s timed out after %i s.\n"
                         % (ce_schedd_ad['Name'], time_out))
        raise
    finally:
        # Cancel any pending alarm first so it cannot fire later into
        # unrelated code, then put the previous handler back.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

    return state, state_info


def get_cert_issuer(path):
    """
    Return the issuer of the certificate file at path.

    Runs /usr/bin/openssl x509 with RFC2253 name formatting, decodes its
    standard output as latin-1, removes every 'issuer=' occurrence and
    strips surrounding whitespace.  The exit status is not checked, so an
    unreadable certificate yields whatever openssl printed on stdout.
    """
    openssl_args = ['/usr/bin/openssl', 'x509', '-noout', '-issuer',
                    '-nameopt', 'RFC2253', '-in', path]
    process = subprocess.Popen(openssl_args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    stdout_bytes, _unused_stderr = process.communicate()
    issuer_text = stdout_bytes.decode('latin-1', 'strict')
    return issuer_text.replace('issuer=', '').strip()


def get_ce_jobs(ce_schedd_ad, time_out):
    """
    Count the jobs in a CE schedd per VO.

    Queries the schedd described by ce_schedd_ad and returns a tuple
    (total_vo_jobs, idle_vo_jobs, running_vo_jobs) of defaultdict(int)
    keyed by the job's x509userproxyvoname.  Jobs without a truthy
    JobStatus or VO name are skipped; JobStatus 1 counts as idle and 2 as
    running.

    Raises RuntimeError (after logging to stderr) if the schedd query
    fails, and TimeoutError (after logging to stderr) if processing the
    result exceeds time_out seconds.
    """
    ce_fqdn = ce_schedd_ad['Machine']
    try:
        ce_schedd = htcondor.Schedd(ce_schedd_ad)
        query = ce_schedd.query(projection=["JobStatus", "x509userproxyvoname"])
    except RuntimeError as exc:
        sys.stderr.write("%s: %s\n" % (exc, ce_fqdn))
        raise

    idle_vo_jobs = defaultdict(int)
    running_vo_jobs = defaultdict(int)
    total_vo_jobs = defaultdict(int)

    # NOTE(review): the alarm is armed only after query() has returned, so
    # it bounds the iteration below, not the query call itself -- confirm
    # whether the query should be covered as well.
    previous_handler = signal.signal(signal.SIGALRM, handler)
    if previous_handler is None:
        # The previous handler was not installed from Python and cannot be
        # re-installed; fall back to the default disposition.
        previous_handler = signal.SIG_DFL
    signal.alarm(time_out)
    try:
        for job in query:
            status = job.get("JobStatus")
            voname = job.get("x509userproxyvoname")
            if not status or not voname:
                continue
            total_vo_jobs[voname] += 1
            if status == 1:
                idle_vo_jobs[voname] += 1
            elif status == 2:
                running_vo_jobs[voname] += 1
    except TimeoutError:
        # Both values must be passed as one tuple; otherwise '%' is applied
        # to ce_fqdn alone and raises TypeError instead of logging.
        sys.stderr.write("CE schedd on %s timed out after %i s.\n" % (ce_fqdn, time_out))
        raise
    finally:
        # Cancel any pending alarm first so it cannot fire later into
        # unrelated code, then put the previous handler back.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)

    return total_vo_jobs, idle_vo_jobs, running_vo_jobs


def main():
    """
    Main provider routine.

    Reads the HTCondor configuration, queries the pool collector and every
    CE schedd advertising HAS_HTCONDOR_CE, and prints the GLUE2 LDIF
    entries (resources, benchmarks, manager, service, endpoints, shares
    and policies) to stdout.  Problems are reported on stderr.

    Exits with status 1 when COLLECTOR_HOST, HTCONDORCE_VONames or
    HTCONDORCE_SiteName is not configured.
    """

    # Get hostname of the batch system central manager (first entry of a
    # possibly comma/whitespace separated list).
    collector_host = htcondor.param.get('COLLECTOR_HOST')
    if not collector_host or not collector_host.strip():
        sys.stderr.write("Error: COLLECTOR_HOST not set\n")
        sys.exit(1)
    central_manager = re.split(r'[\s,]+', collector_host.strip())[0]

    # Get VO Names; drop the empty strings that leading or trailing
    # separators produce so they are not published as shares.
    raw_vonames = htcondor.param.get('HTCONDORCE_VONames')
    vonames = [voname for voname in re.split(r'[\s,]+', raw_vonames or '') if voname]
    if not vonames:
        sys.stderr.write("Error: HTCONDORCE_VONames not set\n")
        sys.exit(1)

    # Get Site Name
    site_name = htcondor.param.get('HTCONDORCE_SiteName')
    if not site_name:
        sys.stderr.write("Error: HTCONDORCE_SiteName: not set\n")
        sys.exit(1)

    # Get the timeout value (seconds) used for the CE ping and job query
    time_out = int(htcondor.param.get('GLUE_PROVIDER_TIMEOUT', 10))

    # Query collector for the number of CPUs and batch system Collector ad
    coll = htcondor.Collector()
    total_cores = {}               # machine name -> DetectedCpus
    topologies = defaultdict(int)  # machine_tuple -> number of startd ads
    machine_attrs = ['Arch', 'OpSys', 'OpSysMajorVer', 'OpSysName', 'DetectedCpus', 'DetectedMemory']
    machine_tuple = namedtuple('machine_tuple', machine_attrs)

    for startd in coll.query(htcondor.AdTypes.Startd, 'State=!="Owner"', machine_attrs + ['Machine']):
        try:
            # Format the attributes for printing
            resource_class = machine_tuple(Arch=startd['Arch'].lower(),
                                           OpSys=startd['OpSys'].lower(),
                                           OpSysMajorVer=str(startd['OpSysMajorVer']),
                                           OpSysName=startd['OpSysName'],
                                           DetectedCpus=str(startd['DetectedCpus']),
                                           DetectedMemory=str(startd['DetectedMemory']))
            machine = startd['Machine']
            detected_cpus = startd['DetectedCpus']
        except KeyError as exc:
            # Skip malformed ads entirely: counting them would either fail
            # on an undefined resource_class or double-count the previous
            # machine's resource class.
            msg = "Malformed machine ad: Missing '{0}' attribute for {1}\n"\
                   .format(exc, startd.get('Machine', 'unknown machine'))
            sys.stderr.write(msg)
            continue

        # Count each machine's cores once, however many ads it advertises.
        if machine not in total_cores:
            total_cores[machine] = detected_cpus
        topologies[resource_class] += 1

    # Get from the configuration types and values of the benchmarks
    specs = ca.ClassAd(htcondor.param.get('HTCONDORCE_SPEC', "[]"))

    # Print the entry for the GLUE2 Resource
    for resource, instances in topologies.items():
        print(format_resource_entry(central_manager, resource, instances))
        print(format_spec_entry(central_manager, resource, specs))

    coll_ad = coll.query(htcondor.AdTypes.Collector)[0]  # the pool collector ad
    version = coll_ad['CondorVersion'].split()[1]

    print(format_manager_entry(central_manager, version, sum(total_cores.values())))

    ce_batch_schedd_ads = coll.query(
        htcondor.AdTypes.Schedd,
        'HAS_HTCONDOR_CE =?= True',
        ['Machine']
    )

    print(format_service_entry(central_manager, ce_batch_schedd_ads, vonames, site_name))

    for ce_batch_schedd_ad in ce_batch_schedd_ads:
        ce_fqdn = ce_batch_schedd_ad['Machine']
        try:
            ce_schedd_ad = get_ce_schedd_ad(ce_fqdn)
        except (RuntimeError, IndexError):
            sys.stderr.write("Unable to locate CE schedd on %s\n" % ce_fqdn)
            continue
        except EnvironmentError:
            # Log before skipping the CE; with the order reversed the
            # message would never be written.
            sys.stderr.write("Failed communication with CE collector on %s\n" % ce_fqdn)
            continue

        # The called helpers already report timeouts and query failures on
        # stderr; here the affected CE is simply skipped.
        try:
            print(format_endpoint_entry(central_manager, ce_schedd_ad, time_out))
        except TimeoutError:
            continue

        try:
            total_vo_jobs, idle_vo_jobs, running_vo_jobs = get_ce_jobs(ce_schedd_ad, time_out)
        except (TimeoutError, RuntimeError):
            continue

        for voname in vonames:
            total_jobs = total_vo_jobs.get(voname, 0)
            idle_jobs = idle_vo_jobs.get(voname, 0)
            running_jobs = running_vo_jobs.get(voname, 0)

            print(format_share_entry(central_manager,
                                     ce_fqdn,
                                     voname,
                                     topologies,
                                     total_jobs,
                                     idle_jobs,
                                     running_jobs))

            print(format_policy_entry(central_manager, ce_fqdn, voname))


# Run the provider only when this file is executed as a script.
if __name__ == '__main__':
    main()
