#! /usr/bin/python

## 20210613
## version 0.1.1dev

################################################################################
##
##  sbagit.py mimics bagit.py, in particular its code architecture
##
##  if only bagit.py functionality is needed then you should be using
##  bagit.py itself
##
##  sbagit.py making a bag adds optional arguments
##   --tar          serialize the bag structure as a tar container
##   --zip          serialize the bag structure as a zip container with minimal
##                  compression
##   --show-suffix  use '.tar' or '.zip' container name suffix
##                  (default: no suffix)
##
##  and --adjunct-file that specifies a non-payload file to be included also.
##  More than one adjunct file can be included.
##
##  a possible confusion is that sbagit.py takes a directory name
##  (like bagit.py) when 'making' but when 'validating' a serialized bag
##  structure takes a file name (unlike bagit.py)
##
##  sbagit.py supports bagit.py 'validate' and 'is_valid' features by
##  temporarily unpacking the container and then deferring to bagit.py
##
##  sbagit.py make_sbag returns a container property (either 'tar' or 'zip') in
##  addition to the bagit.py make_bag properties
##
##  sbagit.py also supports interrogating bag header components within the
##  container without unpacking the container
##
################################################################################

##  shutil.make_archive is new in Python 3.2; since Python 3.8 its "tar"
##  format writes the modern pax (POSIX.1-2001) format

import argparse
import bagit
import gettext
import hashlib
import json
import logging
import re
from pathlib import Path
from pkg_resources import DistributionNotFound, get_distribution
import shutil
import sys
import tarfile
from tempfile import TemporaryDirectory
import zipfile

# Name used for the logger and for the distribution lookup below: "sbagit"
# when the file is run as a script, otherwise the name it was imported under.
MODULE_NAME = "sbagit" if __name__ == "__main__" else __name__

# Module-wide logger used by every function in this file.
LOGGER = logging.getLogger(MODULE_NAME)

##  is there a more modern way for version
##  NOTE(review): importlib.metadata.version() (stdlib since Python 3.8) is
##  the replacement for the deprecated pkg_resources API - consider migrating.
##  NOTE(review): the file header says version 0.1.1dev while the fallback
##  below is "0.1.dev0" - confirm which one is intended.

# Use the version of the installed distribution; fall back to a hard-coded
# development version when no distribution of that name is installed.
try:
    VERSION = get_distribution(MODULE_NAME).version
except DistributionNotFound:
    VERSION = "0.1.dev0"

##
##  gettext (message translation) support: find the directory that holds the
##  message catalogs; the catalog itself is loaded right after this function
##
def find_locale_dir():
    """Return the directory that holds the gettext message catalogs.

    Catalog discovery is not implemented yet, so the result is always
    ``None``.
    """
    ##  FIXME: look for a "locale" sub-directory next to this file and under
    ##  sys.prefix, and return the first one that exists.
    return None


# Load the "sbagit" message catalog at import time. With fallback=True
# gettext returns a pass-through NullTranslations object instead of raising
# when no catalog can be found.
TRANSLATION_CATALOG = gettext.translation(
    "sbagit", localedir=find_locale_dir(), fallback=True
)

# Conventional gettext alias: wraps every user-facing string in this module.
_ = TRANSLATION_CATALOG.gettext

# Module documentation, also shown as the description of the command-line
# help (see _make_parser). It is built at import time so that it can be
# passed through gettext.
__doc__ = (
    _(
        """
sbagit.py is a front end for bagit.py

BagIt is a directory, filename convention for bundling an arbitrary set of
 files with a manifest, cryptographic hashes, and additional metadata. More
 about BagIt can be found at:

    http://purl.org/net/bagit

sbagit.py is a pure python drop in library and command line tool for creating,
 and working with serialized BagIt directories.

Command-Line Usage:

Basic usage is to give sbagit.py a directory to bag up and serialize:

    $ sbagit.py --[tar|zip] my_directory

This does a bag-in-place operation where the current contents will be moved
 into the appropriate BagIt structure and the metadata files will be created
 followed by a serialization, either 'tar' or 'zip' container.

The container file can optionally show a '.tar' or '.zip' suffix. 

You can bag multiple directories if you wish:

    $ sbagit.py --[tar|zip] directory1 directory2

Optionally you can provide metadata which will be stored in bag-info.txt:

    $ sbagit.py --source-organization "Library of Congress" --[tar|zip] directory

You can also select which manifest algorithms will be used:

    $ sbagit.py --sha1 --md5 --sha256 --sha512 --[tar|zip] directory


Using sbagit from your Python code:

    import sbagit
    sbag = sbagit.make_sbag(
        'example-directory', {'Contact-Name': 'Ed Summers'}, tar=True
    )
    print(sbag.entries)
"""
    )
    ##  "% globals()" applies %-formatting with the module globals as the
    ##  mapping, so a "%(NAME)s" placeholder in the (possibly translated)
    ##  text would be replaced by the module-level value of NAME. The text
    ##  above has no placeholders, so it is currently a no-op; a literal
    ##  percent sign in the text must be written as "%%".
    % globals()
)

# standard bag-info.txt metadata
# Each header below is exposed by _make_parser() as a command-line option
# named "--<header in lower case>"; the value ends up in the bag_info dict.
STANDARD_BAG_INFO_HEADERS = [
    "Source-Organization",
    "Organization-Address",
    "Contact-Name",
    "Contact-Phone",
    "Contact-Email",
    "External-Description",
    "External-Identifier",
    "Bag-Size",
    "Bag-Group-Identifier",
    "Bag-Count",
    "Internal-Sender-Identifier",
    "Internal-Sender-Description",
    "BagIt-Profile-Identifier",
    # Bagging-Date is autogenerated
    # Payload-Oxum is autogenerated
]

# Algorithm names offered as "--<name>" command-line flags by _make_parser().
# NOTE: hashlib.algorithms_guaranteed is a set, so its iteration order is
# not defined.
CRYPTOGRAPHIC_HASH_ALGOS = hashlib.algorithms_guaranteed

##  this is only to support describing default
##  (used only in the help text; when no algorithm flag is given, None is
##  passed on and the actual default is presumably chosen by bagit.py -
##  TODO confirm these two names match bagit.py's default)
DEFAULT_CRYPTOGRAPHIC_HASHES = ["sha256", "sha512"]


def make_sbag(
    bag_dir,
    bag_info=None,
    processes=1,
    cryptographic_hashes=None,
    encoding="utf-8",
    tar=None,
    zip=None,
    show_suffix=None,
    adjunct_files=None,
):
    """Bag ``bag_dir`` in place and serialize it into a tar or zip container.

    The directory is first turned into a bag by ``bagit.make_bag``, the
    adjunct files are copied next to the tag files, and the whole bag is
    archived. Afterwards ``bag_dir`` is emptied and holds nothing but the
    container file, which is named after the directory.

    Args:
        bag_dir: directory to bag and serialize (``str`` or ``Path``).
        bag_info: optional dict of bag-info.txt metadata.
        processes: number of processes bagit uses to compute the hashes.
        cryptographic_hashes: manifest algorithm names, ``None`` for the
            bagit default.
        encoding: encoding handed to ``bagit.make_bag``.
        tar: serialize as a tar container (wins when ``zip`` is set as well).
        zip: serialize as a zip container.
        show_suffix: append ``.tar`` / ``.zip`` to the container file name.
        adjunct_files: non-payload files (``str`` or ``Path``) to copy into
            the bag directory; entries that are not existing files are
            skipped with a warning.

    Returns:
        An ``SBag`` for the new container, carrying the ``bagit.Bag``
        properties of the bag that was serialized.

    Raises:
        SBagError: if neither ``tar`` nor ``zip`` is requested, or if the
            archive is missing after it should have been written.
    """
    # Settle the container format before anything is modified: bagging is
    # done in place, so failing afterwards would leave a half-processed
    # directory behind.
    if tar:
        suffix = "tar"
    elif zip:
        suffix = "zip"
    else:
        ##  FIXME
        raise SBagError(_("where is suffix"))

    LOGGER.info(_("Creating serialized bag for directory %s"), bag_dir)

    bag = bagit.make_bag(
        bag_dir,
        bag_info=bag_info,
        processes=processes,
        checksums=cryptographic_hashes,
        encoding=encoding,
    )

    bag_path = Path(bag_dir)

    # Adjunct files arrive as plain strings from the command line, so they
    # are coerced to Path before use.
    for adjunct in adjunct_files or ():
        adjunct = Path(adjunct)
        if adjunct.is_file():
            shutil.copy(adjunct, bag_path / adjunct.name)
        else:
            LOGGER.warning(_("Adjunct file %s not found, skipped"), adjunct)

    ##  FIXME need to test that output file is not already present

    # The archive is written next to the bag directory and holds a single
    # top-level directory named like the bag directory.
    archive = shutil.make_archive(
        base_name=str(bag_path),
        format=suffix,
        root_dir=str(bag_path.parent.resolve()),
        base_dir=bag_path.name,
    )

    # Never delete the source directory unless the archive really exists.
    if not Path(archive).is_file():
        raise SBagError(_("Serialized bag %s was not created") % archive)

    ##  empty the bag_dir
    ##  need to confirm this works on windows
    shutil.rmtree(bag_path.resolve())

    ##  re-establish the directory
    bag_path.mkdir()

    ##  relocate serialized bag
    container_name = bag_path.name
    if show_suffix:
        # Append the suffix: with_suffix() would replace everything after
        # the last dot of a dotted directory name ("my.data" -> "my.tar").
        container_name = "%s.%s" % (container_name, suffix)

    moved = shutil.move(archive, str(bag_path / container_name))
    sbag_name = Path(moved).resolve()

    return SBag(sbag_name, bag)


class SBag(object):

    """A representation of a serialized bag.

    An instance describes a container file (``sbag_name``), carries the
    properties of a ``bagit.Bag`` (``bag``), or both.

    Attributes:
        name: ``Path`` of the container file, ``None`` when none was given.
        container: ``"tar"`` or ``"zip"`` once identified, otherwise ``""``.
        payload: dict keyed by the path, relative to ``data/``, of every
            container entry below the bag's ``data`` directory.
        adjunct: dict keyed by the file name of every entry outside
            ``data/`` that is not bagit.txt, bag-info.txt or a manifest.
        fixity: reserved, not populated yet.
        path, info, tags, entries, algorithms: copied from a ``bagit.Bag``
            by the constructor or by ``validate()``; empty until then.
    """

    ##  file names of the bag's own tag files, never reported as adjunct
    _TAG_FILE_NAMES = ("bag-info.txt", "bagit.txt")
    _MANIFEST_PREFIXES = ("manifest-", "tagmanifest-")

    def __init__(self, sbag_name=None, bag=None):
        """Inspect the container ``sbag_name`` and/or adopt ``bag``.

        The container is only listed, not unpacked. A missing or
        unrecognised container leaves ``container``, ``payload`` and
        ``adjunct`` empty; ``validate()`` reports it as an error.
        """
        super().__init__()

        # Every attribute gets a defined value up front so that later method
        # calls cannot fail with AttributeError on a partly built instance.
        self.name = Path(sbag_name) if sbag_name else None
        self.path = None
        self.info = {}
        self.tags = {}
        self.entries = {}
        self.algorithms = []
        self.adjunct = {}
        self.payload = {}
        self.fixity = {}
        self.container = ""

        if self.name is not None and self.name.is_file():
            self._read_container_listing()

        ## should this be 'elif bag:'
        if bag:
            self.path = bag.path
            self.info = bag.info
            self.tags = bag.tags
            self.entries = bag.entries
            self.algorithms = bag.algorithms

    def __str__(self):
        ##  this is the full file name
        return str(self.name)

    def _read_container_listing(self):
        """Identify the container type and record its payload/adjunct names."""
        if tarfile.is_tarfile(self.name):
            self.container = "tar"
            LOGGER.info(_("Identified sbag container as 'tar'"))
            # tarfile.open() (unlike TarFile()) also reads compressed tars,
            # which is_tarfile() accepts; "with" closes the handle again.
            with tarfile.open(self.name) as archive:
                members = archive.getnames()
        elif zipfile.is_zipfile(self.name):
            self.container = "zip"
            LOGGER.info(_("Identified sbag container as 'zip'"))
            with zipfile.ZipFile(self.name) as archive:
                members = archive.namelist()
        else:
            LOGGER.info(_("Failed to identify sbag"))
            return

        self.payload, self.adjunct = self._classify_members(members)

    @classmethod
    def _classify_members(cls, members):
        """Split container member names into (payload, adjunct) dicts.

        Member names use "/" as separator in both tar and zip containers and
        their first component is the bag's root directory. Names are compared
        component by component, so a root entry such as ``database.txt`` is
        not mistaken for the ``data`` directory.
        """
        payload = {}
        adjunct = {}
        for member in sorted(members):
            parts = [part for part in member.split("/") if part not in ("", ".")]
            if len(parts) < 2:
                # the root directory entry itself
                continue
            inside = parts[1:]
            if inside[0] == "data":
                # "data" alone is the payload directory, not a payload entry
                if len(inside) > 1:
                    payload["/".join(inside[1:])] = True
                continue
            leaf = inside[-1]
            if leaf in cls._TAG_FILE_NAMES or leaf.startswith(cls._MANIFEST_PREFIXES):
                continue
            adjunct[leaf] = True
        return payload, adjunct

    def validate(self, processes=1, fast=False, completeness_only=False):
        """
        Checks the structure and contents are valid.

        If you supply the parameter fast=True the Payload-Oxum (if present) will
        be used to check that the payload files are present and accounted for,
        instead of re-calculating fixities and comparing them against the
        manifest. By default validate() will re-calculate fixities (fast=False).

        The container is unpacked into a temporary directory and the check
        itself is done by bagit.py. On success the bag's properties are
        copied onto this object and True is returned.

        Raises SBagError when the container cannot be unpacked into a bag
        directory; errors raised by bagit.py (bagit.BagError and its
        subclasses) are passed on unchanged.
        """

        with TemporaryDirectory() as temp_dir:
            bag_dir = self._unpack_sbag(directory=temp_dir)
            if bag_dir is None:
                raise SBagError(
                    _("%s is not a readable serialized bag") % self.name
                )
            bag = bagit.Bag(str(bag_dir))
            bag.validate(
                processes=processes,
                fast=fast,
                completeness_only=completeness_only,
            )
            ##  NOTE: bag.path points into the temporary directory, which is
            ##  removed when this block ends
            self.path = bag.path
            self.info = bag.info
            self.tags = bag.tags
            self.entries = bag.entries
            self.algorithms = bag.algorithms
            #self.payload = bag.payload

        return True

    def is_valid(self, fast=False, completeness_only=False):
        """
        Returns validation success or failure as boolean.
        Optional fast parameter passed directly to validate().
        """
        try:
            self.validate(fast=fast, completeness_only=completeness_only)
        except (SBagError, bagit.BagError):
            # bagit.py reports an invalid bag with its own exception types
            return False

        return True

    def _unpack_sbag(self, directory=None):
        """Unpack the container into ``directory`` and locate the bag.

        Returns the bag directory as a Path, or None when the container is
        missing, is neither tar nor zip, or holds no identifiable bag
        directory. Raises SBagError when the archive turns out to be
        unreadable while unpacking.
        """
        target = Path(directory)
        if self.name is None or not self.name.is_file() or not target.is_dir():
            ##  fatal error
            LOGGER.error(_("Cannot unpack sbag %s"), self.name)
            return None

        ##  we expect to be dealing with a serialized bag
        if tarfile.is_tarfile(self.name):
            container = "tar"
            LOGGER.info(_("Identified sbag container as 'tar'"))
        elif zipfile.is_zipfile(self.name):
            container = "zip"
            LOGGER.info(_("Identified sbag container as 'zip'"))
        else:
            LOGGER.error(_("Failed to identify sbag"))
            return None

        self.container = container

        ##  unpack into temporary directory
        LOGGER.info(_("Unpacking sbag %s"), self.name)
        ##  NOTE(security): the archive is extracted as is. Before Python
        ##  3.12 shutil.unpack_archive has no extraction filter, so only
        ##  unpack containers from trusted sources.
        try:
            shutil.unpack_archive(
                str(self.name), extract_dir=str(target), format=container
            )
        except (shutil.ReadError, tarfile.TarError, zipfile.BadZipFile) as exc:
            raise SBagError(
                _("Failed to unpack sbag %(sbag)s: %(error)s")
                % {"sbag": self.name, "error": exc}
            ) from exc

        ##  this should contain the bag directory, named like the container
        ##  without its suffix, or exactly like a suffix-less container
        file_name = self.name.name
        candidates = [target / Path(file_name).with_suffix("").name, target / file_name]
        ##  a renamed container still holds exactly one top-level directory
        extracted = [child for child in target.iterdir() if child.is_dir()]
        if len(extracted) == 1:
            candidates.append(extracted[0])

        for bag_dir in candidates:
            if bag_dir.is_dir():
                LOGGER.info(_("Found bag directory %s"), bag_dir)
                return bag_dir  ## a Path type

        LOGGER.error(_("No bag directory found in sbag %s"), self.name)
        return None

    ##
    ##  possible methods for the future taken from bagit.py
    ##  (placeholders: none of them is implemented, all return None)
    ##

    def manifest_files(self):
        pass

    def tagmanifest_files(self):
        pass

    def compare_manifests_with_fs(self):
        pass

    def compare_fetch_with_fs(self):
        pass

    def payload_files(self):
        pass

    def payload_entries(self):
        pass

    def save(self, processes=1, manifests=False):
        pass

    def tagfile_entries(self):
        pass

    def missing_optional_tagfiles(self):
        pass

    def fetch_entries(self):
        pass

    def files_to_be_fetched(self):
        pass

    def has_oxum(self):
        pass


class SBagError(Exception):
    """Error raised by sbagit when a serialized bag cannot be handled."""


class SBagArgumentParser(argparse.ArgumentParser):
    """Argument parser whose namespace always has a ``bag_info`` dict."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # SBagHeaderAction stores the bag-info.txt metadata in this dict.
        self.set_defaults(bag_info={})


class SBagHeaderAction(argparse.Action):
    """Store a metadata option in ``namespace.bag_info`` under its header name.

    The header name is the option string without its leading dashes and with
    every dash-separated word capitalized ("--contact-name" becomes
    "Contact-Name").
    """

    def __call__(self, parser, namespace, values, option_string=None):
        words = option_string.lstrip("-").split("-")
        header = "-".join(word.capitalize() for word in words)
        namespace.bag_info[header] = values


def _make_parser():
    """Build the command-line parser of sbagit.

    Besides the fixed options, one flag is added per hash algorithm in
    CRYPTOGRAPHIC_HASH_ALGOS and one metadata option per header in
    STANDARD_BAG_INFO_HEADERS. Parsed metadata is collected in the
    ``bag_info`` dict of the namespace.

    Returns:
        The configured SBagArgumentParser.
    """
    parser = SBagArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="sbagit version %s\n\n%s\n" % (VERSION, __doc__.strip()),
    )
    parser.add_argument(
        "--processes",
        type=int,
        dest="processes",
        default=1,
        help=_(
            "Use multiple processes to calculate cryptographic hashes faster"
            " (default: %(default)s)"
        ),
    )

    parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))

    parser.add_argument(
        "--quiet",
        action="store_true",
        help=_("Suppress all progress information other than errors"),
    )

    parser.add_argument(
        "--validate",
        action="store_true",
        help=_(
            "Validate bag that has been serialized in the container file name" " given."
        ),
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help=_(
            "Modify --validate behaviour to only test whether the bag directory"
            " has the number of files and total size specified in Payload-Oxum"
            " without performing cryptographic hash validation to detect corruption."
        ),
    )

    parser.add_argument(
        "--completeness-only",
        action="store_true",
        help=_(
            "Modify --validate behaviour to test whether the bag directory"
            " has the expected payload specified in the cryptographic hash manifests"
            " without performing cryptographic hash validation to detect corruption."
        ),
    )

    parser.add_argument(
        "--tar",
        action="store_true",
        help=_("Make a valid bag structure and then create a tarball archive."),
    )

    parser.add_argument(
        "--zip",
        action="store_true",
        help=_(
            "Make a valid bag structure and then create a zip archive with minimal"
            " compression. [#### check this is 'STORED' ####]"
        ),
    )

    parser.add_argument(
        "--show-suffix",
        action="store_true",
        help=_(
            "When using tar or zip to serialize the bag structure show a container"
            " file name suffix (default: no suffix)."
        ),
    )

    cryptographic_hash_args = parser.add_argument_group(
        _("Cryptographic hash algorithms"),
        _(
            "Select the manifest algorithms to be used when creating bags"
            " (default=%s)"
        )
        % ", ".join(DEFAULT_CRYPTOGRAPHIC_HASHES),
    )

    # CRYPTOGRAPHIC_HASH_ALGOS is a set: sort it so that the options are
    # listed in the same order in every run.
    for i in sorted(CRYPTOGRAPHIC_HASH_ALGOS):
        # "SHA256" is shown as "SHA-256" in the help text
        alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
        cryptographic_hash_args.add_argument(
            "--%s" % i,
            action="append_const",
            dest="cryptographic_hashes",
            const=i,
            help=_("Generate %s manifest when creating a bag") % alg_name,
        )

    metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
    for header in STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument(
            "--%s" % header.lower(),
            type=str,
            action=SBagHeaderAction,
            # no attribute of its own: the value is stored in bag_info
            default=argparse.SUPPRESS,
        )

    # "--adjunct-file" is the documented, dash-separated spelling used by all
    # other options; "--adjunct_file" is kept for backward compatibility.
    parser.add_argument(
        "--adjunct-file",
        "--adjunct_file",
        action="append",
        dest="adjunct_files",
        help=_("The name of an adjunct file to include in the sbag"),
    )

    parser.add_argument(
        "directory",
        nargs="+",
        help=_(
            "Directory which will be converted firstly into a bag in"
            " place by moving any existing files into the BagIt structure and"
            " creating the manifests and other metadata followed by"
            " serializing the structure into either a 'tar' or 'zip' container."
        ),
    )

    return parser


def _configure_logging(opts):
    ##
    ##  is there a better way for doing this?
    ##
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    if opts.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    if opts.log:
        logging.basicConfig(filename=opts.log, level=level, format=log_format)
    else:
        logging.basicConfig(level=level, format=log_format)


def main():
    """Command-line entry point: validate or create serialized bags.

    With --validate every positional argument is taken as a container file
    and validated; otherwise every positional argument is taken as a
    directory that is bagged and serialized. The process exits with status 0
    when everything succeeded and 1 when at least one item failed.
    """
    if "--version" in sys.argv:
        print(_("sbagit version %s") % VERSION)
        sys.exit(0)

    parser = _make_parser()
    args = parser.parse_args()

    if args.processes < 0:
        parser.error(_("The number of processes must be 0 or greater"))

    if args.fast and not args.validate:
        parser.error(_("--fast is only allowed as an option for --validate!"))

    if args.tar and args.zip:
        parser.error(_("only one of --tar and --zip options is allowed!"))

    if not args.tar and not args.zip and not args.validate:
        parser.error(_("one of options --tar or --zip is required"))

    if args.show_suffix and not args.tar and not args.zip:
        parser.error(
            _("--show-suffix is only allowed as an option for --tar or --zip!")
        )

    _configure_logging(args)

    # The command line delivers plain strings, make_sbag works on paths.
    adjunct_files = [Path(name) for name in args.adjunct_files or ()] or None

    returncode = 0
    for bag_dir in args.directory:
        # validate the bag
        if args.validate:
            try:
                if not Path(bag_dir).is_file():
                    raise SBagError(_("container file not found"))
                sbag = SBag(bag_dir)
                # validate raises SBagError, or the bagit.py errors
                # (bagit.BagError and subclasses) for an invalid bag
                sbag.validate(
                    processes=args.processes,
                    fast=args.fast,
                    completeness_only=args.completeness_only,
                )
                if args.fast:
                    LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                else:
                    LOGGER.info(_("%s is valid"), bag_dir)
            except (SBagError, bagit.BagError) as e:
                LOGGER.error(
                    _("%(sbag)s is invalid: %(error)s"), {"sbag": bag_dir, "error": e}
                )
                returncode = 1

        # make the bag
        else:
            # Top-level boundary: any failure is logged with its traceback
            # and the remaining directories are still processed.
            try:
                make_sbag(
                    bag_dir,
                    bag_info=args.bag_info,
                    adjunct_files=adjunct_files,
                    processes=args.processes,
                    cryptographic_hashes=args.cryptographic_hashes,
                    tar=args.tar,
                    zip=args.zip,
                    show_suffix=args.show_suffix,
                )
            except Exception as exc:
                LOGGER.error(
                    _(
                        "Failed to create serialized bag in %(bag_directory)s: %(error)s"
                    ),
                    {"bag_directory": bag_dir, "error": exc},
                    exc_info=True,
                )
                returncode = 1

    sys.exit(returncode)


# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
