#!python
''' Chopped up from flatfile code, just a one-off for identifying never_published VAERS_IDs.
    Gary Hawkins
    http://univaers.com/download/
'''
from datetime import datetime
from pathlib import Path
import glob, os, sys, re, shutil, pprint, inspect
import subprocess as sp
import time as _time

import pandas as pd
import zipfile_deflate64 as zipfile  # use ... pip install zipfile-deflate64 ... that solves it, see files_from_zip()

date_floor   = ''  # '2022-11-11'  # Useful any time for no redo of what's done
date_ceiling = ''  # '2021-05-15'  # '2021-12-16'  # In testing to stop if greater (newer) than this

dir_top = '.'
use_test_cases = 0
if use_test_cases:
    dir_top = 'z_test_cases'

dir_input        = f'{dir_top}/../Download/ALL_VAERS_DROPS'
dir_compared     = f'{dir_top}/vaers_full_compared'  # changed from vaers_changes, resulting in a tad bit of cognitive confusion throughout the code now but overall better
dir_working      = f'{dir_top}/vaers_working'
dir_flattened    = f'{dir_top}/vaers_flattened'
dir_consolidated = f'{dir_top}/vaers_consolidated'

file_stats            = f'{dir_top}/stats.csv'
file_never_published  = f'{dir_top}/any_never_published.txt'  # Even non-covid vaxes, no possibility of knowing whether covid or other
file_any_ever         = f'{dir_top}/all_ever_seen.txt'        # Even non-covid vaxes
file_covid_ever       = f'{dir_top}/all_covid_seen.txt'       # Only covid reports
file_symptoms_deduped = f'{dir_top}/symptoms_deduped.txt'

if use_test_cases:
    dir_input = f'{dir_top}/drops'

tones = 0
floor_notice_printed = 0
ceiling_notice_printed = 0
covid_earliest_vaers_id = 890871  # 896636  # was during trials but delayed, not published 2020-12-18 but later instead, gapfill

elapsed_begin = _time.time()
elapsed_drop  = _time.time()

df_data  = pd.DataFrame()
df_stats = pd.DataFrame()

any_never_published = {}
covid_ever = {}  # grows, all covid VAERS_IDs ever identified
any_ever   = {}  # Any vax
files = {}
dict_done_flag = {}
stats = {}
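# Illustrative only (values taken from the commented-out settings above): to skip
# drops already processed, set a floor; to stop before newer drops while testing,
# set a ceiling, e.g.
#   date_floor   = '2022-11-11'
#   date_ceiling = '2021-12-16'
# Both are compared as 'YYYY-MM-DD' strings, which sort the same as dates.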
def validate_dirs_and_files():
    ''' Create directories if not existing, and load any previously saved ID sets '''
    global any_never_published, any_ever, covid_ever
    print(); print('validate_dirs_and_files() ...')

    if not os.path.exists(dir_input):
        print(f'The expected inputs directory of CDC drops does not exist: {dir_input}')
        os.makedirs(dir_input)
        print('   That directory has been created and should be populated with each data drop to process')
        print('   ... and containing the CDC csv or zip files')
        exit('\n see validate_dirs_and_files()')
    if not os.path.exists(dir_working):
        print(f'   The expected working directory for processing does not exist, creating {dir_working}')
        os.makedirs(dir_working)
    if not os.path.exists(dir_consolidated):
        print(f'   The expected consolidated directory for storing files does not exist, creating {dir_consolidated}')
        os.makedirs(dir_consolidated)
    if not os.path.exists(dir_flattened):
        print(f'   The expected flattened directory for storing files does not exist, creating {dir_flattened}')
        os.makedirs(dir_flattened)
    if not os.path.exists(dir_compared):
        print(f'   The expected output directory for changes does not exist, creating {dir_compared}')
        os.makedirs(dir_compared)

    if use_test_cases:  # remove previous compared
        for filename in Path(dir_compared).glob('*.csv'):
            print(f'Delete {filename}')
            filename.unlink()  # delete
        if os.path.exists(file_stats):            Path(file_stats).unlink()
        if os.path.exists(file_never_published):  Path(file_never_published).unlink()
        if os.path.exists(file_any_ever):         Path(file_any_ever).unlink()
        if os.path.exists(file_covid_ever):       Path(file_covid_ever).unlink()
        if os.path.exists(file_symptoms_deduped): Path(file_symptoms_deduped).unlink()

    if os.path.exists(file_never_published):
        with open(file_never_published, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        any_never_published = {x: 1 for x in lines}
    else:
        with open(file_never_published, 'w'):  # create it empty
            pass

    if os.path.exists(file_any_ever):
        with open(file_any_ever, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        any_ever = {x: 1 for x in lines}
    else:
        with open(file_any_ever, 'w'):  # create it empty
            pass

    if os.path.exists(file_covid_ever):
        with open(file_covid_ever, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        covid_ever = {x: 1 for x in lines}
    else:
        with open(file_covid_ever, 'w'):  # create it empty
            pass

    files_populate_information()
    if not files['input']['files']:
        exit(f" No csv or zip files in dir_input {dir_input}, no point in continuing")
    print()
    print(f"{(len(files['input']['date']) - len(files['changes']['date'])):>10} drops in input to process"); print()
    print(f"   First (oldest) input: {files['input']['files'][0]}")
    print(f"   Last  (newest) input: {files['input']['files'][-1]}")

    files_changes = sorted(files['changes']['files'])
    date_changes  = sorted(files['changes']['date'])
    date_input    = sorted(files['input']['date'])
    if files_changes:
        if date_input[-1] <= date_changes[-1]:
            print()
            if len(files_changes) >= 2:
                print(f" Second-to-last changes file is {files_changes[-2]}")
            if len(files_changes) >= 1:
                print(f" Last (newest) changes file is {files_changes[-1]}")
        print(); print(f' Already processed files do appear in {dir_compared} and the latest will be built upon:')
        count = 0
        for f in files_changes:  # showing some of those
            print(f'   {f}')
            count += 1
            if count >= 5:
                print(f'   ... {len(files_changes)} total')
                break
        print()
    else:
        ''' First covid 2020-12-25 treatment initially, or other with date_floor '''
        print(); print(f' No processed files show up in {dir_compared}. This is the first in the loop.')
    return
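# The three read-or-create blocks above repeat one pattern; a hypothetical helper
# (a sketch only, not wired into the script) that would capture it:
def _load_id_set(path):
    ''' Return {int_id: 1} from a newline-delimited file, creating the file empty if absent. '''
    if not os.path.exists(path):
        open(path, 'w').close()  # create it empty
        return {}
    with open(path, 'r') as f:
        return {int(x.strip()): 1 for x in f if x.strip()}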
def files_from_zip(zip_file, dir_dst):
    ''' This requires ... pip install zipfile-deflate64 ...
        to handle zips straight from https://vaers.hhs.gov/data/datasets.html
        See https://stackoverflow.com/a/73040025/962391
        The alternative is to unzip and rezip to get away from compression type 9 (deflate64), a licensing issue.
    '''
    archive = zipfile.ZipFile(zip_file)
    print(f'   unzip {zip_file}')
    for file in archive.namelist():  # only 2020... and NonDomestic files
        if file.startswith('202') or file.lower().startswith('nond'):
            archive.extract(file, './' + dir_dst)
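# A minimal sketch of why zipfile-deflate64 is needed: the stdlib zipfile raises
# NotImplementedError ("compression type 9") on deflate64 members, which the
# drop-in zipfile_deflate64 module handles. Hypothetical probe, not used above:
def _stdlib_can_unzip(zip_file):
    import zipfile as _stdlib_zipfile
    try:
        with _stdlib_zipfile.ZipFile(zip_file) as z:
            z.testzip()  # reads every member; a deflate64 member raises NotImplementedError
        return True
    except NotImplementedError:
        return False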
def files_populate_information():
    ''' Often called, updating the 'files' variable '''
    global floor_notice_printed, ceiling_notice_printed

    if not files:  # make the keys
        for x in ['input', 'working', 'flattened', 'changes', 'consolidated']:
            files[x] = {}
            for y in ['date', 'files']:
                files[x][y] = []
            # set _dir
            if x == 'input':
                files[x]['_dir'] = dir_input
            elif x == 'working':
                files[x]['_dir'] = dir_working
            elif x == 'changes':
                files[x]['_dir'] = dir_compared
            elif x == 'flattened':
                files[x]['_dir'] = dir_flattened
            elif x == 'consolidated':
                files[x]['_dir'] = dir_consolidated

    # current values
    for thing in list(files.keys()):
        _dir = files[thing]['_dir']
        # full paths gathered recursively; lowercase applied only in the endswith checks below
        full = sorted([y for x in os.walk(_dir) for y in glob.glob(os.path.join(x[0], '*' + '.*'))])
        # note other files/dirs can be there without a problem, only .csv or .zip are picked up
        full = [x for x in full if re.search(r'\\\d{4}\-\d{2}\-\d{2}', x)]  # filename must start with a date like 2020-12-24
        full = [linux_path(x) for x in full]
        full = [x for x in full if (x.lower().endswith('.csv') or x.lower().endswith('.zip'))]
        full = [x for x in full if not (x.lower().endswith('_a.csv') or x.lower().endswith('_b.csv'))]
        # date only, like 2020-12-24
        files[thing]['date'] = sorted(set([date_from_filename(x) for x in full]))  # uniquing in the case of test case CSV inputs
        # date to either the zip file or directory name
        files[thing]['keyval'] = {date_from_filename(x): x for x in full}
        files[thing]['valkey'] = {x: date_from_filename(x) for x in full}
        files[thing]['files']  = list(files[thing]['valkey'].keys())

    # Hack for testing when input files are only flattened files rather than in the drops dir.
    if use_test_cases:
        files['input'] = files['flattened']

    do_file_limits = 0
    if date_floor:
        do_file_limits = 1
        if not floor_notice_printed:
            print(f'\n\n\n\t\t date_floor is set at {date_floor}, limiting files\n\n')
            floor_notice_printed = 1
    if date_ceiling:
        do_file_limits = 1
        if not ceiling_notice_printed:
            print(f'\n\n\n\t\t date_ceiling is set at {date_ceiling}, limiting files\n\n')
            ceiling_notice_printed = 1
    if do_file_limits:
        # remove those that don't apply
        if date_floor:
            files['input']['date'] = [x for x in files['input']['date'] if x >= date_floor]
        if date_ceiling:
            files['input']['date'] = [x for x in files['input']['date'] if x <= date_ceiling]
        for y in ['input']:  # only this
            files[y]['date']   = [x for x in files[y]['date'] if x in files['input']['date']]
            files[y]['keyval'] = {k: v for k, v in files[y]['keyval'].items() if k in files['input']['date']}
            files[y]['valkey'] = {k: v for k, v in files[y]['valkey'].items() if v in files['input']['date']}
            files[y]['files']  = list(files[y]['valkey'].keys())
    # pp.pprint(files)
    return

def open_file_to_df(filename, doprint=1):
    ''' Read CSV filename into dataframe df '''
    df = 'see open_file_to_df()'
    try:
        if doprint:
            print(f'   open {filename:>54}', flush=True, end='')
        with open(filename, encoding='utf-8-sig', errors='replace') as f:  # 'utf-8-sig' and 'ISO-8859-1', need to resolve this
            df = pd.read_csv(f, index_col=None, header=0, sep=',', engine='python', encoding='ISO-8859-1').fillna('')
        if doprint:
            max_vid = 'ok'
            if 'VAERS_ID' in df.columns:
                max_vid = f'Highest VAERS_ID {df.VAERS_ID.astype(int).max():>7}'
            print(f' ... {max_vid} {len(df):>7} rows')
    except ValueError as e:
        print(f'\n\t{e}')
    df = types_set(df)
    warn_mixed_types(df)
    return df
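# On the encoding ambiguity flagged in open_file_to_df(): the file is opened first
# with encoding='utf-8-sig', so pandas receives already-decoded text and its own
# encoding='ISO-8859-1' argument should have no effect on that handle. A sketch of
# the single-encoding alternative (hypothetical, not used by this script):
def _read_csv_one_encoding(filename, encoding='utf-8-sig'):
    return pd.read_csv(filename, index_col=None, header=0, sep=',',
                       engine='python', encoding=encoding).fillna('')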
def open_files(_date):  # like './vaers_drop_inputs/2020-12-25'
    ''' Input files in dir_input:
            csv within directories
            zip files in a single directory, containing csv, treated somewhat as if they were folders.
    '''
    files_populate_information()

    if _date in files['consolidated']['date']:  # already consolidated
        print(f'   {_date} already consolidated, no need to copy input files to dir_working')
        shutil.rmtree(dir_working)  # removing directory
        os.mkdir(dir_working)
        set_files_date_marker(dir_working, _date)  # a flag used by consolidate()
        return
    if _date in files['flattened']['date']:
        print(f'   Skipping unzip because flattened for {_date} already exists')
        return
    if _date not in files['input']['keyval']:
        exit(f" Failed to find in files['input']['keyval'] the _date {_date} in open_files() ")

    files_value = files['input']['keyval'][_date]
    if 'csv' in files_value:
        print(f'   Copy all {_date} to {dir_working}')
        to_copy = [x for x in files['input']['files'] if _date in x]
        shutil.rmtree(dir_working)
        os.mkdir(dir_working)
        for x in to_copy:
            shutil.copy(x, dir_working)
    elif isinstance(files_value, list):  # another case earlier in development, csv files already extracted manually
        print(f'   Copy {_date}/* to {dir_working}')
        shutil.rmtree(dir_working)  # removing directory to avoid error on the next line
        shutil.copytree(_date, dir_working)
        set_files_date_marker(dir_working, _date)
    elif 'zip' in files_value:  # zip file, treat it sort of like a directory here
        shutil.rmtree(dir_working)
        os.makedirs(dir_working)
        files_from_zip(files_value, dir_working)
    else:
        exit(f'   Unexpected _date {_date} in open_files() ')
    set_files_date_marker(dir_working, _date)
    return

def warn_mixed_types(df):
    ''' Warn when any column holds more than one Python type '''
    if df is None or (not len(df)):
        return
    len_types_unique = len(df.applymap(type).drop_duplicates())  # DataFrame.applymap became DataFrame.map in pandas 2.1
    if len_types_unique > 1:
        print(f'\n\nline {inspect.stack()[1][2]} MIXED TYPES: {len_types_unique}')
        for col in df:
            list_of_types_complex = df[[col]].applymap(type).drop_duplicates().values.astype(str).tolist()
            if len(list_of_types_complex) > 1:
                types_simple = []
                for x in list_of_types_complex:
                    if 'int' in str(x):
                        types_simple.append('int')
                    elif 'str' in str(x):
                        types_simple.append('str')
                    elif 'float' in str(x):
                        types_simple.append('float')
                    elif 'obj' in str(x):
                        types_simple.append('obj')
                    else:
                        if x not in types_simple:
                            types_simple.append(x)
                print(f'   {col} {types_simple}')
        print('\n\n')
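# Illustrative case warn_mixed_types() is meant to catch, on toy data (values
# invented for the example): a column holding both int and str after a merge.
#   warn_mixed_types(pd.DataFrame({'VAERS_ID': [916600, '916601']}))
#   ->  line ... MIXED TYPES: 2
#       VAERS_ID ['int', 'str']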
def types_set(df):
    ''' FIX move deleted to their cell
        Columns handled:
        VAERS_ID cell_edits status changes VAX_TYPE VAX_MANU VAX_LOT VAX_DOSE_SERIES VAX_ROUTE VAX_SITE VAX_NAME
        RECVDATE STATE AGE_YRS CAGE_YR CAGE_MO SEX RPT_DATE SYMPTOM_TEXT DIED DATEDIED L_THREAT ER_VISIT HOSPITAL
        BIRTH_DEFECT OFC_VISIT ER_ED_VISIT HOSPDAYS X_STAY DISABLE RECOVD VAX_DATE ONSET_DATE NUMDAYS LAB_DATA
        V_ADMINBY V_FUNDBY OTHER_MEDS CUR_ILL HISTORY PRIOR_VAX SPLTTYPE FORM_VERS TODAYS_DATE ALLERGIES symptom_entries
    '''
    if 'VAERS_ID' not in df.columns:
        return df  # skip stats.csv etc
    if 'gapfill' in df.columns:
        return df  # skip stats.csv etc
    df = df.copy()
    df = df.fillna('')
    df = df.astype(str)  # all columns as string, then fix some
    for col in ['VAERS_ID', 'HOSPDAYS', 'NUMDAYS']:
        if col in df.columns:
            df = make_numeric(df, col)
            df[col] = df.loc[(~df[col].isna()) & df[col].ne(''), col].astype('float64').astype(int)  # plain .astype(int) in pandas doesn't stick, they remain float
    for col in ['AGE_YRS', 'CAGE_YR', 'CAGE_MO']:
        if col in df.columns:
            df = make_numeric(df, col)
    if 'AGE_YRS' in df.columns:  # a way of determining it's the data df, not the vax or symptoms df
        for col in ['status', 'changes']:  # Initializing everything for the time being, for simplicity and perhaps sanity
            if col not in df.columns:
                df[col] = ''
        del col
    if 'cell_edits' in df.columns:
        df.loc[(df['cell_edits'].isna()) | df['cell_edits'].eq(''), 'cell_edits'] = int(0)
        df['cell_edits'] = df['cell_edits'].astype(int)  # TODO: Why needed? invalid literal for int() with base 10: '0.0'
    else:
        df['cell_edits'] = int(0)
    return df

def set_files_date_marker(_dir, filename):
    ''' For visual clarity create an empty file in _dir named for the particular date '''
    file_date_part = date_from_filename(filename)
    pattern_dir = _dir + '/'
    if file_date_part:
        files_date_marker = pattern_dir + file_date_part
        if os.path.exists(files_date_marker):
            return
        else:
            print(f'   Creating in {pattern_dir} date marker file {file_date_part}')
            open(files_date_marker, 'a').close()
    else:
        print(f'   FAILED creating in {pattern_dir} date marker file {file_date_part}')
        return('EMPTY IN set_files_date_marker, must fix if hit')

def get_files_date_marker(_dir):
    ''' Date from filename for files in a directory '''
    pattern_dir = './' + _dir + '/'
    files_with_date = glob.glob(pattern_dir + '*-*')
    if not files_with_date:
        return('')
    files_with_date = files_with_date[0]  # any will do, thus first in the list (even if just one of course)
    file_date_part = date_from_filename(files_with_date)
    if file_date_part:
        return file_date_part
    else:
        print('  EMPTY in get_files_date_marker')
    return

def date_from_filename(filename):
    ''' Pull just the date portion of a filename '''
    return re.sub(r'.*(\d{4}\-\d{2}\-\d{2}).*', r'\1', filename)

def files_concat(files_list):
    ''' Join/concatenate files '''
    df_out = None
    count = 0
    for filename in files_list:
        df = open_file_to_df(filename, doprint=1)
        if count == 0:
            df_out = df.copy()
        else:
            df_out = pd.concat([df_out.reset_index(drop=True), df.reset_index(drop=True)], ignore_index=True)
        count += 1
    df_out = df_out.fillna('')  # There are nans in all NonDomestic, why?
    ''' VAERS_IDs inconsistently have a leading zero or not, fixing that for uniformity.
        This is important for removal of duplicates, for example. Making double sure they are integers.
    '''
    df_out['VAERS_ID'] = pd.to_numeric(df_out.VAERS_ID)
    ''' It's true, CDC publishes files with some full, exact, complete duplicates. Why?
        An example is 896795 in 2020-12-18 NonDomesticVAERSVAX.csv
        Earlier note: For debug, but tricky as df_dupes are not all dupes but merely contain one that is:
            df_duplicated = df_out.loc[df_out.duplicated(subset=df_out.columns) == True].sort_values(by='VAERS_ID')  # each record only once though
            vids_duplicated = df_duplicated.VAERS_ID.to_list()
            df_dupes = df_out.loc[df_out.VAERS_ID.isin(vids_duplicated)]  # to examine them
    '''
    len_before = len(df_out)
    # df_out_ori = df_out.copy()
    df_out = df_out.drop_duplicates(df_out.columns).reset_index(drop=True)  # must avoid dropping on default by index, hence df_out.columns
    if len_before - len(df_out):  # was len(df_out) - len_before, which only worked because a negative int is truthy
        print(f'{(len_before - len(df_out)):>30} exact duplicates dropped in concatenated files, now {len(df_out.VAERS_ID.to_list())} VAERS_IDs')
    return df_out

def linux_path(x):
    return re.sub(r'\\', '/', x)

def subrange(list_in, _max):
    ''' Input a list; for print, return up to _max (like 5) items at the start and end of that list '''
    if len(list_in) == 0:
        return ''  # i.e. not empty square brackets
    if len(list_in) <= _max:
        return list_in
    list_in = sorted(list_in)
    this_many = int(max(min(len(list_in) / 2, _max), 1))
    head = f'{list_in[:this_many]}' if len(list_in) > 1 else ''
    head = re.sub(r'\]', '', head)
    tail = f'{list_in[-this_many:]}' if len(list_in) > 1 else ''
    tail = re.sub(r'\[', '', tail)
    return(f'{len(list_in):>7} {head} ... {tail}')
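# Example of subrange() output (illustrative values):
#   subrange(list(range(100, 120)), 3)
#   ->  '     20 [100, 101, 102 ... 117, 118, 119]'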
def single_plural(count, word):
    if count == 1:
        return re.sub(r'.$', '', word)  # removing the 's' on the end if just 1
    else:
        return word

def line():
    ''' Print line number (debug) '''
    caller = inspect.getframeinfo(inspect.stack()[1][0])
    print(f'   Line {caller.lineno} in {caller.function}()')

def tone():
    ''' Utility sounding a tone. For example at assign_outliers() main work starting, or the end of a run '''
    if not tones:
        return
    print('\a', end='')  # sounds a tone

def do_elapsed(marker_in):
    ''' Calculate elapsed time from a given input marker as the starting point. '''
    tone()
    elapsd = (_time.time() - marker_in) / 60  # Minutes
    return '{} hr {} min'.format(int(elapsd / 60), '%.1f' % (elapsd % 60))

def exit(_in=None):
    ''' Exit with message (intentionally shadows the builtin) '''
    if not _in:
        _in = ''
    print(); print(f'{_in}')
    do_elapsed(elapsed_begin)
    print(); print(f'Done with {__file__} at line {inspect.stack()[1][2]}, {str(datetime.now())}')  # clock time
    print(); print('- - - - - - - - - - - - - - - - - - - - - - - - ')
    tone()
    os._exit(0)

def make_numeric(df_in, col):
    if not len(df_in):
        return df_in
    df_in = df_in.copy()
    df_in[col] = df_in[col].fillna('')
    df_in[col] = df_in[col].astype(str)
    df_in[col] = pd.to_numeric(df_in[col], errors='ignore')
    # Desperation, number columns usually should be empty when 0, setting to 0.0 during the run, emptied later
    df_in.loc[(df_in[col].isna()), col] = 0.0
    df_in[col] = df_in[col].astype('float64').round(4)
    if 'nan' in df_in[col].to_list():
        print('is_nan in make_numeric()')
    return df_in

def move_rows(df_subset, df_move_from, df_move_to):
    ''' Move rows in df_subset out of df_move_from into df_move_to
        Return the new df_move_from and df_move_to
    '''
    if not len(df_subset):
        return df_move_from, df_move_to
    df_move_to = df_move_to.copy()
    df_move_to = pd.concat([df_move_to.reset_index(drop=True), df_subset.reset_index(drop=True)], ignore_index=True)
    df_move_from = df_move_from.loc[~df_move_from.VAERS_ID.isin(df_move_to.VAERS_ID)]  # everything from before, not now in done
    return df_move_from, df_move_to

def move_column_forward(df_in, column):
    ''' Reorder columns, moving column to second (after VAERS_ID) '''
    if column not in df_in:
        return df_in
    columns_pre = list(df_in.columns)
    if columns_pre[1] == column:
        return df_in  # already in that spot
    if 'VAERS_ID' not in df_in.columns:
        col_order = [column]  # start
        for col in df_in.columns:
            if col == column:
                continue
            col_order.append(col)
        return df_in[col_order]  # df_in.reindex(col_order, axis=1)
    col_order = ['VAERS_ID', column]
    for c in columns_pre:
        if c not in col_order:
            col_order.append(c)
    return df_in.reindex(col_order, axis=1)

def print_date_banner(this_drop_date):
    print()
    print('= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ')
    print(f'  Next date {this_drop_date}')
    print('= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ')
    print()

def stats_initialize(date_currently):
    ''' Per drop, stored in the stats dictionary, then written to stats.csv with each date as a row. '''
    global stats
    stats = {
        'date'             : date_currently,
        'drop_input_covid' : 0,
        'comparisons'      : 0,  # Number of drops.
        'new'              : 0,  # New this drop.
        'deleted'          : 0,  # Were visible, now gone.
        'restored'         : 0,  # Had been deleted.
        'gapfill'          : 0,  # Held back but finally showed up.
        'never_published'  : 0,  # Total still being held back. Gaps in the VAERS_ID sequence.
        'any_ever'         : 0,  # Total any-jab VAERS_IDs ever seen.
        'covid_ever'       : 0,  # Total covid VAERS_IDs ever seen.
    }
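# The core arithmetic in do_counts() below, on toy numbers (illustrative only):
# with lo=890900 and hi=890910 present in a drop, the span is hi - lo = 10; if only
# 8 VAERS_IDs appear, the printout implies 10 - 8 = 2 missing in that span
# (note the span is hi - lo, not the inclusive slot count hi - lo + 1).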
def do_counts(files_date_marker):
    global covid_ever, any_ever, stats
    print(); print('  do_counts')
    files_populate_information()
    df_data_all = files_concat(glob.glob(dir_working + '/' + '*VAERSDATA.csv'))
    print(); print()

    # Checking something
    if 896905 in df_data_all.VAERS_ID.to_list():
        print(f'  ------ {files_date_marker} 896905 in df_data_all.VAERS_ID.to_list()')
        pause = 1
    else:
        print(f'  ------ {files_date_marker} 896905 NOT in df_data_all.VAERS_ID.to_list()')

    len_before = len(df_data_all)
    df_data_all = df_data_all.loc[df_data_all.VAERS_ID >= covid_earliest_vaers_id]
    print(); print(f'{len_before - len(df_data_all):>10} records removed prior to the first covid report (covid_earliest_vaers_id {covid_earliest_vaers_id})')
    print(f'{len(df_data_all):>10} any vax reports to work with (unique VAERS_IDs)'); print()

    vids_list     = df_data_all.VAERS_ID.to_list()
    lo_data_all   = min(vids_list)
    hi_data_all   = max(vids_list)
    diff_data_all = hi_data_all - lo_data_all
    len_data_all  = len(vids_list)
    missing       = diff_data_all - len_data_all
    print(f'{missing:>10} missing (any/all vax never published in covid era) is implied by only {len_data_all} present in {diff_data_all} range with lo {lo_data_all} and hi {hi_data_all}')
    print()

    dict_vids_list = {x: 1 for x in vids_list}
    # vids_new = [x for x in vids_list if x not in any_ever.keys()]
    vids_new = [x for x in vids_list if x not in any_ever]
    print(f'{len(vids_new):>10} being added to `any_ever` for this drop, any/all vax')
    any_ever.update({x: 1 for x in vids_list})
    # any_ever = {**any_ever, **{x: 1 for x in vids_list}}
    # any_ever = {x: 1 for x in set(any_ever.keys() + vids_list)}

    len_before = len(df_data_all)
    df_data_all = df_data_all.drop_duplicates(subset='VAERS_ID')
    if len_before - len(df_data_all):  # was len(df_data_all) - len_before, which only worked because a negative int is truthy
        print(f'{(len_before - len(df_data_all)):>10} duplicates dropped in df_data_all on VAERS_IDs')
    print(f'{len(set(df_data_all.VAERS_ID.to_list())):>10} ALL/ANY reports to work with'); print()

    # Grab all VAERS_IDs before filtering for covid, to be able to identify gaps properly
    do_never_ever(vids_list, files_date_marker, 'do_counts on df_data_all')
    stats['drop_input_covid'] = len(df_data_all)
    print(f'  do_counts of {files_date_marker} done')
    return
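# The gap algebra in do_never_ever() below, on toy numbers (hypothetical IDs):
#   seen so far:      any_ever = {100, 101, 103, 106}
#   expected range:   100..106 inclusive, i.e. range(100, 107)
#   never published:  set(range(100, 107)) - any_ever  ->  {102, 104, 105}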
def do_never_ever(vids_present, date_currently, source):
    ''' Reports never published, identifying gaps in the VAERS_ID sequence.
        Required to keep all VAERS_IDs ever seen so far, doing so as keys in a dictionary.
        Many of these can be non-covid also. Use of dictionaries is fast, lists slow.
        The any_ever file was late on the scene and maybe all of the drop stuff is no longer needed,
        just hi, lo, range (sequence) and gaps.
    '''
    global any_never_published, any_ever, stats
    ''' There's surely some crude logic here that could be fixed '''
    lo_this_drop = min(vids_present)
    hi_this_drop = max(vids_present)
    highest_ever = hi_this_drop
    if any_never_published and any_ever:
        lo_ever      = min(min(any_never_published), min(any_ever), lo_this_drop)
        highest_ever = max(max(any_never_published), max(any_ever), hi_this_drop)
    else:
        lo_ever = lo_this_drop
    list_range_ever_any  = sorted(range(lo_ever, highest_ever + 1))  # range() is non-inclusive at the top end
    list_range_drop_only = sorted(range(lo_this_drop, hi_this_drop + 1))

    # Patch b/c I'm missing the 2021-01-01 drop; these were published but later deleted.
    # Some others may have been deleted since this writing, being missed now, from that date.
    patch = [905000, 905553, 907988, 907989, 908265, 908266, 908269, 908279, 909030, 909617,
             910294, 910295, 910297, 910298, 910300, 910301, 910303, 910304, 910305, 910312,
             910313, 910575, 910577, 910594, 910601, 910617, 910646, 913144]
    if 'never_published_patch_done' not in dict_done_flag:
        dict_done_flag['never_published_patch_done'] = 0  # init thing
    if use_test_cases:
        dict_done_flag['never_published_patch_done'] = 1
    if (not dict_done_flag['never_published_patch_done']) and (hi_this_drop > min(patch)):  # any case except a contiguous-run restart not yet at the missing 2021-01-01 drop
        if date_currently > '2021-01-01':  # This new year's drop might have been skipped by CDC, don't know.
            dict_patch = {x: 1 for x in patch}
            any_ever.update(dict_patch)
            # any_ever = {x: 1 for x in set(any_ever.keys() + dict_patch.keys())}
            # any_ever = {**{x: 1 for x in patch}, **any_ever}  # adding these few, assuming them published for simplicity
            dict_done_flag['never_published_patch_done'] = 1  # global
            print(f'{len(patch):>10} for missing 2021-01-01 drop patch added to any_ever')

    dict_vids_present = {x: 1 for x in vids_present}
    any_ever.update(dict_vids_present)
    # any_ever = {**any_ever, **dict_vids_present}
    # any_ever = {x: 1 for x in set(any_ever.keys() + dict_vids_present.keys())}

    set_of_gap_fills = set(set(vids_present) & any_never_published.keys())
    gaps_new     = set(list_range_drop_only) - set(dict_vids_present.keys())  # any in list_range_drop_only that are not in dict_vids_present
    set_of_never = set(list_range_ever_any) - set(any_ever)  # any in list_range_ever_any that are not in any_ever (expected in range but never ever seen)
    set_of_never = set_of_never - set(any_ever)  # remove any in never if in any_ever, making sure
    any_never_published = {x: 1 for x in sorted(set_of_never)}
    hi_any_never_published = max(set_of_never) if set_of_never else 0  # guard added: max() on an empty set raises ValueError

    print()
    print(f'  From {source}:')
    print(f'    hi_any_never_published {hi_any_never_published}')
    print(f'    lo_ever covid          {lo_ever} covid_earliest_vaers_id')
    print(f'    lo_this_drop covid     {lo_this_drop}')
    print(f'    hi_this_drop covid     {hi_this_drop}')
    print(f'    vids_present covid     {subrange(dict_vids_present, 6)}')
    print(f'    gaps_filled covid      {subrange(set_of_gap_fills, 6)}')
    print(f'    gaps_new any           {subrange(gaps_new, 6)}')
    print(f'    any_never_published    {subrange(set_of_never, 6)}')
    print(f'    list_range_ever_any    {subrange(list_range_ever_any, 6)}')
    print(f'    VAERS_IDs any: {lo_ever:>7} to {max(list_range_ever_any):>7} expected: {len(list_range_ever_any):>7} any_ever: {len(any_ever)} any_never_published: {len(set_of_never)}')

    ''' Sanity check. All VAERS_IDs must be in ONLY either any_ever or any_never_published.
        In Python 3, you can use https://stackoverflow.com/a/49710152/962391
            intersection = dict(dict1.items() & dict2.items())  # in both
            union        = dict(dict1.items() | dict2.items())  # in either
            difference   = dict(dict1.items() ^ dict2.items())  # in only one or the other
            print(f'{len(intersection):>10} intersection')
            print(f'{len(union):>10} union')
            print(f'{len(difference):>10} difference')
    '''
    intersection = dict(any_ever.items() & any_never_published.items())
    if intersection:
        print(f'\n\n\n  WARNING: Expected 0 intersection got {len(intersection)} {subrange(intersection.keys(), 6)} \n\n\n')
    union = dict(any_ever.items() | any_never_published.items())
    if len(union) != len(list_range_ever_any):
        print(f'\n\n\n  WARNING: union {len(union)} expected to equal range {len(list_range_ever_any)} {subrange(intersection.keys(), 6)} \n\n\n')
    print(f'  union: {len(union):>7} {len(intersection)} intersection in any_ever v. any_never_published')
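    # Since every value in these dicts is 1, the items()-based set operations above
    # behave exactly like key-set operations; an equivalent sketch:
    #   set(any_ever) & set(any_never_published)  # IDs in both
    #   set(any_ever) | set(any_never_published)  # IDs in either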
    # Sanity check for none in any_never_published also in any_ever, and vice-versa
    in_both_never_and_ever_1 = [x for x in any_never_published if x in any_ever]
    in_both_never_and_ever_2 = [x for x in any_ever if x in any_never_published]
    if in_both_never_and_ever_1:
        print()
        print(f'  ======= {len(in_both_never_and_ever_1):>10} in_both_never_and_ever_1 {subrange(in_both_never_and_ever_1, 6)}')
        print()
    else:
        print('  in_both_never_and_ever_1 none, ok')
    if in_both_never_and_ever_2:
        print()
        print(f'  ======= {len(in_both_never_and_ever_2):>10} in_both_never_and_ever_2 {subrange(in_both_never_and_ever_2, 6)}')
        print()
    else:
        print('  in_both_never_and_ever_2 none, ok')
    print()

    if 896905 in set_of_never:
        print(f'  ------ {date_currently} 896905 in set_of_never')
        pause = 1
    else:
        print(f'  ------ {date_currently} 896905 NOT in set_of_never')
    if 896905 in any_ever:
        print(f'  ------ {date_currently} 896905 in any_ever')
        pause = 1
    else:
        print(f'  ------ {date_currently} 896905 NOT in any_ever')

    with open(file_never_published, 'w') as f:
        for x in set_of_never:
            f.write(f'{x}\n')
    with open(file_any_ever, 'w') as f:
        for x in list(any_ever.keys()):
            f.write(f'{x}\n')
    return

def run_all():
    print(__file__); print()
    print('run_all() ...'); print()
    if os.path.exists(file_never_published):  Path(file_never_published).unlink()
    if os.path.exists(file_any_ever):         Path(file_any_ever).unlink()
    if os.path.exists(file_covid_ever):       Path(file_covid_ever).unlink()
    if os.path.exists(file_symptoms_deduped): Path(file_symptoms_deduped).unlink()
    validate_dirs_and_files()
    for this_drop_date in files['input']['date']:
        print_date_banner(this_drop_date)
        stats_initialize(this_drop_date)  # per drop, then a totals row is calculated
        open_files(this_drop_date)
        do_counts(this_drop_date)

run_all()
exit()