Source code for uwsift.project.organize_data_topics

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
organize_data_topics.py
~~~~~~~~~~~~~~~~~~~~~~~

PURPOSE
Mirror the default AHI data directory structure into a directory structure organized by topic.

Note: Hardlinks are required rather than softlinks because, in basic testing, cwrsync for Windows did not properly
support softlinks.

Data for SIFT is organized as `/odyssey/isis/tmp/davidh/sift_data/ahi/YYYY_MM_DD_JJJ/HHMM/*B[01-16]*.tif`.
This script will organize them into `/odyssey/isis/tmp/davidh/sift_data/ahi/<topic>/<case>/*B[01-16]*.tif`, where
`<case>` is named `<start>_<end>_<delta minutes>`. All of the files for a specific case will be in the same directory.

REFERENCES


REQUIRES


:author: David Hoese <david.hoese@ssec.wisc.edu>
:copyright: 2014 by University of Wisconsin Regents, see AUTHORS for more details
:license: GPLv3, see LICENSE for more details
"""
__docformat__ = "reStructuredText"
__author__ = "davidh"

import logging
import os
import re
import sys
from collections import namedtuple
from datetime import datetime, timedelta
from glob import glob

LOG = logging.getLogger(__name__)

# Pattern for AHI geotiff file names such as
# ``HS_H08_20150717_2100_B01_FLDK_R20.merc.tif``; the named groups expose the
# date, time, band, resolution and extension parts of the name.
FILENAME_RE = r"HS_H08_(?P<date>\d{8})_(?P<time>\d{4})_(?P<band>B\d{2})_FLDK_(?P<res>R\d+)\.(?P<ext>.+)"
fn_re = re.compile(FILENAME_RE)

# strftime format used for the start/end portions of a case directory name.
DT_FORMAT = "%Y%m%d_%H%M"
# Case directory name: formatted start, formatted end, and the step in whole minutes.
CASE_NAME_FORMAT = "{start}_{end}_{delta:02d}"

# One time window of data to mirror:
#   topic_title -- name of the topic the case belongs to
#   start, end  -- first and last timestamps (inclusive) of the window
#   delta       -- step between consecutive timestamps (0 means "start only")
#   bands       -- band selection; every case below uses "all"
DataCase = namedtuple("DataCase", ["topic_title", "start", "end", "delta", "bands"])

### Guam Cases ###
# Maps a topic name to the list of cases mirrored for that topic.
guam_cases: dict = {
    # Kathy's Cases
    "Introduction": [
        DataCase(
            "Introduction",
            datetime(2015, 7, 17, 21, 0, 0),
            datetime(2015, 7, 18, 20, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
        DataCase(
            "Introduction",
            datetime(2015, 7, 18, 1, 0, 0),
            datetime(2015, 7, 18, 3, 20, 0),
            timedelta(minutes=10),
            "all",
        ),
        DataCase(
            "Introduction",
            datetime(2015, 7, 18, 14, 0, 0),
            datetime(2015, 7, 18, 16, 0, 0),
            timedelta(minutes=10),
            "all",
        ),
        DataCase(
            "Introduction",
            datetime(2016, 3, 9, 0, 0, 0),
            datetime(2016, 3, 9, 4, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
        DataCase(
            "Introduction",
            datetime(2016, 3, 9, 1, 30, 0),
            datetime(2016, 3, 9, 4, 0, 0),
            timedelta(minutes=10),
            "all",
        ),
    ],
    # Scott's Cases
    "Water Vapor": [
        DataCase(
            "Water Vapor",
            datetime(2015, 10, 7, 0, 0, 0),
            datetime(2015, 10, 8, 0, 0, 0),
            timedelta(minutes=30),
            "all",
        ),
        DataCase(
            "Water Vapor",
            datetime(2016, 2, 19, 19, 0, 0),
            datetime(2016, 2, 20, 5, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
    ],
    # Tim's Cases
    "Weighting Functions": [
        DataCase(
            "Weighting Functions",
            datetime(2015, 9, 20, 2, 30, 0),
            datetime(2015, 9, 20, 2, 30, 0),
            timedelta(minutes=0),
            "all",
        ),
        DataCase(
            "Weighting Functions",
            datetime(2015, 9, 20, 0, 0, 0),
            datetime(2015, 9, 20, 6, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
        DataCase(
            "Weighting Functions",
            datetime(2015, 9, 20, 1, 30, 0),
            datetime(2015, 9, 20, 2, 30, 0),
            timedelta(minutes=10),
            "all",
        ),
        DataCase(
            "Weighting Functions",
            datetime(2015, 9, 20, 1, 0, 0),
            datetime(2015, 9, 20, 3, 0, 0),
            timedelta(minutes=10),
            "all",
        ),
    ],
    # Jordan's Cases
    "Extra": [
        DataCase(
            "Extra",
            datetime(2015, 8, 17, 12, 0, 0),
            datetime(2015, 8, 18, 12, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
        DataCase(
            "Extra",
            datetime(2015, 8, 17, 22, 0, 0),
            datetime(2015, 8, 18, 1, 0, 0),
            timedelta(minutes=10),
            "all",
        ),
        # NOTE(review): the end date used to be 2015-08-15, i.e. *before* the
        # start, which made this case select no files at all. 2015-08-25 is
        # assumed to be the intended date (one-digit typo; it also makes the
        # 10-minute case below fall inside this hourly window, as in the other
        # topics) -- confirm with the case owner.
        DataCase(
            "Extra",
            datetime(2015, 8, 24, 15, 0, 0),
            datetime(2015, 8, 25, 21, 0, 0),
            timedelta(minutes=60),
            "all",
        ),
        DataCase(
            "Extra",
            datetime(2015, 8, 25, 2, 0, 0),
            datetime(2015, 8, 25, 5, 0, 0),
            timedelta(minutes=10),
            "all",
        ),
    ],
}


def _case_times(start, end, delta):
    """Yield the timestamps from *start* through *end* (inclusive), *delta* apart.

    A delta of less than one second can never usefully advance toward *end*,
    so in that case only *start* is yielded. Nothing is yielded when *start*
    lies after *end*.
    """
    if start > end:
        return
    if int(delta.total_seconds()) <= 0:
        yield start
        return
    current = start
    while current <= end:
        yield current
        current += delta


def main():
    """Mirror the dated AHI directory tree into per-topic case directories.

    Parses the command line, changes the working directory to the base AHI
    directory and, for every case in ``guam_cases``, creates
    ``<base>/<topic>/<case name>`` and hardlinks into it every file matching
    ``YYYY_MM_DD_JJJ/HHMM/*_YYYYMMDD_HHMM_B??_*.merc.tif`` for each timestamp
    of the case.

    An already existing case directory is skipped with an error unless
    ``--overwrite`` is given, in which case it is reused and existing links
    in it are replaced.

    :raises NotADirectoryError: if the base AHI directory does not exist
    """
    import argparse

    parser = argparse.ArgumentParser(description="Regenerate or generate mirrored AHI data structure")
    parser.add_argument(
        "base_ahi_dir",
        # Without nargs="?" a positional argument is mandatory and the
        # default below could never take effect.
        nargs="?",
        default="/odyssey/isis/tmp/davidh/sift_data/ahi",
        help="Base AHI directory for the geotiff data files " "(next child directory is the full dated directory)",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="verbosity",
        action="count",
        default=int(os.environ.get("VERBOSITY", 2)),
        help="each occurrence increases verbosity 1 level through " "ERROR-WARNING-Info-DEBUG (default Info)",
    )
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing hardlinks")
    args = parser.parse_args()

    levels = [logging.ERROR, logging.WARN, logging.INFO, logging.DEBUG]
    # Clamp on both sides: a negative VERBOSITY environment value must not
    # index the list from its end.
    level = levels[max(0, min(3, args.verbosity))]
    logging.basicConfig(level=level)

    if not os.path.isdir(args.base_ahi_dir):
        raise NotADirectoryError("Directory does not exist: %s" % (args.base_ahi_dir,))

    # Resolve before chdir: a relative base directory joined again after the
    # chdir would otherwise point at <base>/<base>/... for the case directories.
    base_dir = os.path.abspath(args.base_ahi_dir)
    # The glob patterns below are relative to the base directory.
    os.chdir(base_dir)

    for section_name, cases in guam_cases.items():
        for case in cases:
            start_str = case.start.strftime(DT_FORMAT)
            end_str = case.end.strftime(DT_FORMAT)
            # Note this only uses the minutes!
            case_name = CASE_NAME_FORMAT.format(
                start=start_str, end=end_str, delta=int(case.delta.total_seconds() / 60.0)
            )
            case_dir = os.path.join(base_dir, section_name, case_name)

            if os.path.isdir(case_dir):
                if not args.overwrite:
                    LOG.error("Case directory already exists: %s", case_dir)
                    continue
                LOG.debug("Reusing existing case directory: %s", case_dir)
            else:
                LOG.debug("Creating case directory: %s", case_dir)
                os.makedirs(case_dir)

            if int(case.delta.total_seconds()) == 0:
                LOG.debug("Only one file needed to meet delta of 0")

            # _case_times stops after the first timestamp for a zero delta,
            # even when no files match, so such a case cannot loop forever.
            for t in _case_times(case.start, case.end, case.delta):
                glob_pattern = t.strftime("%Y_%m_%d_%j/%H%M/*_%Y%m%d_%H%M_B??_*.merc.tif")
                matches = glob(glob_pattern)
                if not matches:
                    LOG.error("Zero files found matching pattern: %s", glob_pattern)
                    continue

                for input_pathname in matches:
                    fn = os.path.basename(input_pathname)
                    link_path = os.path.join(case_dir, fn)
                    if os.path.lexists(link_path):
                        if not args.overwrite:
                            LOG.debug("Link '%s' already exists, skipping...", link_path)
                            continue
                        # os.link refuses to replace an existing path, so
                        # remove the old link first.
                        os.remove(link_path)
                    LOG.debug("Creating hardlink '%s' -> '%s'", link_path, input_pathname)
                    os.link(input_pathname, link_path)

    LOG.info("done mirroring files")
if __name__ == "__main__":
    # Run the command-line entry point and hand its return value to sys.exit.
    sys.exit(main())