#!python
''' Chopped up from flatfile code, just a one-off for identifying never_published VAERS_IDs.
    Gary Hawkins
    http://univaers.com/download/
'''
from datetime import datetime
from pathlib import Path
import glob, os, sys, re, shutil, pprint, inspect
import subprocess as sp
import time as _time

import pandas as pd
import zipfile_deflate64 as zipfile  # use ... pip install zipfile-deflate64 ... that solves it, see files_from_zip()

date_floor   = ''  # '2022-11-11'  # Useful any time for no redo of what's done
date_ceiling = ''  # '2021-05-15'  # '2021-12-16'  # In testing to stop if greater (newer) than this

dir_top = '.'
use_test_cases = 0
if use_test_cases:
    dir_top = 'z_test_cases'

dir_input        = f'{dir_top}/../Download/ALL_VAERS_DROPS'
dir_compared     = f'{dir_top}/vaers_full_compared'  # changed from vaers_changes, resulting in a tad bit of cognitive confusion throughout the code now but overall better
dir_working      = f'{dir_top}/vaers_working'
dir_flattened    = f'{dir_top}/vaers_flattened'
dir_consolidated = f'{dir_top}/vaers_consolidated'

file_stats            = f'{dir_top}/stats.csv'
file_never_published  = f'{dir_top}/any_never_published.txt'  # Even non-covid vaxes, no possibility of knowing whether covid or other
file_any_ever         = f'{dir_top}/all_ever_seen.txt'        # Even non-covid vaxes
file_covid_ever       = f'{dir_top}/all_covid_seen.txt'       # Only covid reports
file_symptoms_deduped = f'{dir_top}/symptoms_deduped.txt'

if use_test_cases:
    dir_input = f'{dir_top}/drops'

tones = 0
floor_notice_printed = 0
ceiling_notice_printed = 0
covid_earliest_vaers_id = 890871  # 896636  # was during trials but delayed, not published 2020-12-18 but later instead, gapfill

elapsed_begin = _time.time()
elapsed_drop  = _time.time()

df_data  = pd.DataFrame()
df_stats = pd.DataFrame()

any_never_published = {}
covid_ever = {}  # grows, all covid VAERS_IDs ever identified
any_ever   = {}  # Any vax
files = {}
dict_done_flag = {}
stats = {}
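# Illustrative only (values taken from the commented-out settings above): to skip
# drops already processed, set a floor; to stop before newer drops while testing,
# set a ceiling, e.g.
#   date_floor   = '2022-11-11'
#   date_ceiling = '2021-12-16'
# Both are compared as 'YYYY-MM-DD' strings, which sort the same as dates.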
def validate_dirs_and_files():
    ''' Create directories if not existing, and load any previously saved ID sets '''
    global any_never_published, any_ever, covid_ever
    print(); print('validate_dirs_and_files() ...')

    if not os.path.exists(dir_input):
        print(f'The expected inputs directory of CDC drops does not exist: {dir_input}')
        os.makedirs(dir_input)
        print('   That directory has been created and should be populated with each data drop to process')
        print('   ... and containing the CDC csv or zip files')
        exit('\n see validate_dirs_and_files()')
    if not os.path.exists(dir_working):
        print(f'   The expected working directory for processing does not exist, creating {dir_working}')
        os.makedirs(dir_working)
    if not os.path.exists(dir_consolidated):
        print(f'   The expected consolidated directory for storing files does not exist, creating {dir_consolidated}')
        os.makedirs(dir_consolidated)
    if not os.path.exists(dir_flattened):
        print(f'   The expected flattened directory for storing files does not exist, creating {dir_flattened}')
        os.makedirs(dir_flattened)
    if not os.path.exists(dir_compared):
        print(f'   The expected output directory for changes does not exist, creating {dir_compared}')
        os.makedirs(dir_compared)

    if use_test_cases:  # remove previous compared
        for filename in Path(dir_compared).glob('*.csv'):
            print(f'Delete {filename}')
            filename.unlink()  # delete
        if os.path.exists(file_stats):            Path(file_stats).unlink()
        if os.path.exists(file_never_published):  Path(file_never_published).unlink()
        if os.path.exists(file_any_ever):         Path(file_any_ever).unlink()
        if os.path.exists(file_covid_ever):       Path(file_covid_ever).unlink()
        if os.path.exists(file_symptoms_deduped): Path(file_symptoms_deduped).unlink()

    if os.path.exists(file_never_published):
        with open(file_never_published, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        any_never_published = {x: 1 for x in lines}
    else:
        with open(file_never_published, 'w'):  # create it empty
            pass

    if os.path.exists(file_any_ever):
        with open(file_any_ever, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        any_ever = {x: 1 for x in lines}
    else:
        with open(file_any_ever, 'w'):  # create it empty
            pass

    if os.path.exists(file_covid_ever):
        with open(file_covid_ever, 'r') as f:
            lines = f.readlines()
        lines = [x.strip() for x in lines]
        lines = [int(x) for x in lines if x]
        covid_ever = {x: 1 for x in lines}
    else:
        with open(file_covid_ever, 'w'):  # create it empty
            pass

    files_populate_information()
    if not files['input']['files']:
        exit(f" No csv or zip files in dir_input {dir_input}, no point in continuing")
    print()
    print(f"{(len(files['input']['date']) - len(files['changes']['date'])):>10} drops in input to process"); print()
    print(f"   First (oldest) input: {files['input']['files'][0]}")
    print(f"   Last  (newest) input: {files['input']['files'][-1]}")

    files_changes = sorted(files['changes']['files'])
    date_changes  = sorted(files['changes']['date'])
    date_input    = sorted(files['input']['date'])
    if files_changes:
        if date_input[-1] <= date_changes[-1]:
            print()
            if len(files_changes) >= 2:
                print(f" Second-to-last changes file is {files_changes[-2]}")
            if len(files_changes) >= 1:
                print(f" Last (newest) changes file is {files_changes[-1]}")
        print(); print(f' Already processed files do appear in {dir_compared} and the latest will be built upon:')
        count = 0
        for f in files_changes:  # showing some of those
            print(f'   {f}')
            count += 1
            if count >= 5:
                print(f'   ... {len(files_changes)} total')
                break
        print()
    else:
        ''' First covid 2020-12-25 treatment initially, or other with date_floor '''
        print(); print(f' No processed files show up in {dir_compared}. This is the first in the loop.')
    return
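# The three read-or-create blocks above repeat one pattern; a hypothetical helper
# (a sketch only, not wired into the script) that would capture it:
def _load_id_set(path):
    ''' Return {int_id: 1} from a newline-delimited file, creating the file empty if absent. '''
    if not os.path.exists(path):
        open(path, 'w').close()  # create it empty
        return {}
    with open(path, 'r') as f:
        return {int(x.strip()): 1 for x in f if x.strip()}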
def files_from_zip(zip_file, dir_dst):
    ''' This requires ... pip install zipfile-deflate64 ...
        to handle zips straight from https://vaers.hhs.gov/data/datasets.html
        See https://stackoverflow.com/a/73040025/962391
        The alternative is to unzip and rezip to get away from compression type 9 (deflate64), a licensing issue.
    '''
    archive = zipfile.ZipFile(zip_file)
    print(f'   unzip {zip_file}')
    for file in archive.namelist():  # only 2020... and NonDomestic files
        if file.startswith('202') or file.lower().startswith('nond'):
            archive.extract(file, './' + dir_dst)
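# A minimal sketch of why zipfile-deflate64 is needed: the stdlib zipfile raises
# NotImplementedError ("compression type 9") on deflate64 members, which the
# drop-in zipfile_deflate64 module handles. Hypothetical probe, not used above:
def _stdlib_can_unzip(zip_file):
    import zipfile as _stdlib_zipfile
    try:
        with _stdlib_zipfile.ZipFile(zip_file) as z:
            z.testzip()  # reads every member; a deflate64 member raises NotImplementedError
        return True
    except NotImplementedError:
        return False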
def files_populate_information():
    ''' Often called, updating the 'files' variable '''
    global floor_notice_printed, ceiling_notice_printed

    if not files:  # make the keys
        for x in ['input', 'working', 'flattened', 'changes', 'consolidated']:
            files[x] = {}
            for y in ['date', 'files']:
                files[x][y] = []
            # set _dir
            if x == 'input':
                files[x]['_dir'] = dir_input
            elif x == 'working':
                files[x]['_dir'] = dir_working
            elif x == 'changes':
                files[x]['_dir'] = dir_compared
            elif x == 'flattened':
                files[x]['_dir'] = dir_flattened
            elif x == 'consolidated':
                files[x]['_dir'] = dir_consolidated

    # current values
    for thing in list(files.keys()):
        _dir = files[thing]['_dir']
        # full paths gathered recursively; lowercase applied only in the endswith checks below
        full = sorted([y for x in os.walk(_dir) for y in glob.glob(os.path.join(x[0], '*' + '.*'))])
        # note other files/dirs can be there without a problem, only .csv or .zip are picked up
        full = [x for x in full if re.search(r'\\\d{4}\-\d{2}\-\d{2}', x)]  # filename must start with a date like 2020-12-24
        full = [linux_path(x) for x in full]
        full = [x for x in full if (x.lower().endswith('.csv') or x.lower().endswith('.zip'))]
        full = [x for x in full if not (x.lower().endswith('_a.csv') or x.lower().endswith('_b.csv'))]
        # date only, like 2020-12-24
        files[thing]['date'] = sorted(set([date_from_filename(x) for x in full]))  # uniquing in the case of test case CSV inputs
        # date to either the zip file or directory name
        files[thing]['keyval'] = {date_from_filename(x): x for x in full}
        files[thing]['valkey'] = {x: date_from_filename(x) for x in full}
        files[thing]['files']  = list(files[thing]['valkey'].keys())

    # Hack for testing when input files are only flattened files rather than in the drops dir.
    if use_test_cases:
        files['input'] = files['flattened']

    do_file_limits = 0
    if date_floor:
        do_file_limits = 1
        if not floor_notice_printed:
            print(f'\n\n\n\t\t date_floor is set at {date_floor}, limiting files\n\n')
            floor_notice_printed = 1
    if date_ceiling:
        do_file_limits = 1
        if not ceiling_notice_printed:
            print(f'\n\n\n\t\t date_ceiling is set at {date_ceiling}, limiting files\n\n')
            ceiling_notice_printed = 1
    if do_file_limits:
        # remove those that don't apply
        if date_floor:
            files['input']['date'] = [x for x in files['input']['date'] if x >= date_floor]
        if date_ceiling:
            files['input']['date'] = [x for x in files['input']['date'] if x <= date_ceiling]
        for y in ['input']:  # only this
            files[y]['date']   = [x for x in files[y]['date'] if x in files['input']['date']]
            files[y]['keyval'] = {k: v for k, v in files[y]['keyval'].items() if k in files['input']['date']}
            files[y]['valkey'] = {k: v for k, v in files[y]['valkey'].items() if v in files['input']['date']}
            files[y]['files']  = list(files[y]['valkey'].keys())
    # pp.pprint(files)
    return

def open_file_to_df(filename, doprint=1):
    ''' Read CSV filename into dataframe df '''
    df = 'see open_file_to_df()'
    try:
        if doprint:
            print(f'   open {filename:>54}', flush=True, end='')
        with open(filename, encoding='utf-8-sig', errors='replace') as f:  # 'utf-8-sig' and 'ISO-8859-1', need to resolve this
            df = pd.read_csv(f, index_col=None, header=0, sep=',', engine='python', encoding='ISO-8859-1').fillna('')
        if doprint:
            max_vid = 'ok'
            if 'VAERS_ID' in df.columns:
                max_vid = f'Highest VAERS_ID {df.VAERS_ID.astype(int).max():>7}'
            print(f' ... {max_vid} {len(df):>7} rows')
    except ValueError as e:
        print(f'\n\t{e}')
    df = types_set(df)
    warn_mixed_types(df)
    return df
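# On the encoding ambiguity flagged in open_file_to_df(): the file is opened first
# with encoding='utf-8-sig', so pandas receives already-decoded text and its own
# encoding='ISO-8859-1' argument should have no effect on that handle. A sketch of
# the single-encoding alternative (hypothetical, not used by this script):
def _read_csv_one_encoding(filename, encoding='utf-8-sig'):
    return pd.read_csv(filename, index_col=None, header=0, sep=',',
                       engine='python', encoding=encoding).fillna('')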
def open_files(_date):  # like './vaers_drop_inputs/2020-12-25'
    ''' Input files in dir_input:
            csv within directories
            zip files in a single directory, containing csv, treated somewhat as if they were folders.
    '''
    files_populate_information()

    if _date in files['consolidated']['date']:  # already consolidated
        print(f'   {_date} already consolidated, no need to copy input files to dir_working')
        shutil.rmtree(dir_working)  # removing directory
        os.mkdir(dir_working)
        set_files_date_marker(dir_working, _date)  # a flag used by consolidate()
        return
    if _date in files['flattened']['date']:
        print(f'   Skipping unzip because flattened for {_date} already exists')
        return
    if _date not in files['input']['keyval']:
        exit(f" Failed to find in files['input']['keyval'] the _date {_date} in open_files() ")

    files_value = files['input']['keyval'][_date]
    if 'csv' in files_value:
        print(f'   Copy all {_date} to {dir_working}')
        to_copy = [x for x in files['input']['files'] if _date in x]
        shutil.rmtree(dir_working)
        os.mkdir(dir_working)
        for x in to_copy:
            shutil.copy(x, dir_working)
    elif isinstance(files_value, list):  # another case earlier in development, csv files already extracted manually
        print(f'   Copy {_date}/* to {dir_working}')
        shutil.rmtree(dir_working)  # removing directory to avoid error on the next line
        shutil.copytree(_date, dir_working)
        set_files_date_marker(dir_working, _date)
    elif 'zip' in files_value:  # zip file, treat it sort of like a directory here
        shutil.rmtree(dir_working)
        os.makedirs(dir_working)
        files_from_zip(files_value, dir_working)
    else:
        exit(f'   Unexpected _date {_date} in open_files() ')
    set_files_date_marker(dir_working, _date)
    return

def warn_mixed_types(df):
    ''' Warn when any column holds more than one Python type '''
    if df is None or (not len(df)):
        return
    len_types_unique = len(df.applymap(type).drop_duplicates())  # DataFrame.applymap became DataFrame.map in pandas 2.1
    if len_types_unique > 1:
        print(f'\n\nline {inspect.stack()[1][2]} MIXED TYPES: {len_types_unique}')
        for col in df:
            list_of_types_complex = df[[col]].applymap(type).drop_duplicates().values.astype(str).tolist()
            if len(list_of_types_complex) > 1:
                types_simple = []
                for x in list_of_types_complex:
                    if 'int' in str(x):
                        types_simple.append('int')
                    elif 'str' in str(x):
                        types_simple.append('str')
                    elif 'float' in str(x):
                        types_simple.append('float')
                    elif 'obj' in str(x):
                        types_simple.append('obj')
                    else:
                        if x not in types_simple:
                            types_simple.append(x)
                print(f'   {col} {types_simple}')
        print('\n\n')
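# Illustrative case warn_mixed_types() is meant to catch, on toy data (values
# invented for the example): a column holding both int and str after a merge.
#   warn_mixed_types(pd.DataFrame({'VAERS_ID': [916600, '916601']}))
#   ->  line ... MIXED TYPES: 2
#       VAERS_ID ['int', 'str']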
def types_set(df):
    ''' FIX move deleted to their cell
        Columns handled:
        VAERS_ID cell_edits status changes VAX_TYPE VAX_MANU VAX_LOT VAX_DOSE_SERIES VAX_ROUTE VAX_SITE VAX_NAME
        RECVDATE STATE AGE_YRS CAGE_YR CAGE_MO SEX RPT_DATE SYMPTOM_TEXT DIED DATEDIED L_THREAT ER_VISIT HOSPITAL
        BIRTH_DEFECT OFC_VISIT ER_ED_VISIT HOSPDAYS X_STAY DISABLE RECOVD VAX_DATE ONSET_DATE NUMDAYS LAB_DATA
        V_ADMINBY V_FUNDBY OTHER_MEDS CUR_ILL HISTORY PRIOR_VAX SPLTTYPE FORM_VERS TODAYS_DATE ALLERGIES symptom_entries
    '''
    if 'VAERS_ID' not in df.columns:
        return df  # skip stats.csv etc
    if 'gapfill' in df.columns:
        return df  # skip stats.csv etc
    df = df.copy()
    df = df.fillna('')
    df = df.astype(str)  # all columns as string, then fix some
    for col in ['VAERS_ID', 'HOSPDAYS', 'NUMDAYS']:
        if col in df.columns:
            df = make_numeric(df, col)
            df[col] = df.loc[(~df[col].isna()) & df[col].ne(''), col].astype('float64').astype(int)  # plain .astype(int) in pandas doesn't stick, they remain float
    for col in ['AGE_YRS', 'CAGE_YR', 'CAGE_MO']:
        if col in df.columns:
            df = make_numeric(df, col)
    if 'AGE_YRS' in df.columns:  # a way of determining it's the data df, not the vax or symptoms df
        for col in ['status', 'changes']:  # Initializing everything for the time being, for simplicity and perhaps sanity
            if col not in df.columns:
                df[col] = ''
        del col
    if 'cell_edits' in df.columns:
        df.loc[(df['cell_edits'].isna()) | df['cell_edits'].eq(''), 'cell_edits'] = int(0)
        df['cell_edits'] = df['cell_edits'].astype(int)  # TODO: Why needed? invalid literal for int() with base 10: '0.0'
    else:
        df['cell_edits'] = int(0)
    return df

def set_files_date_marker(_dir, filename):
    ''' For visual clarity create an empty file in _dir named for the particular date '''
    file_date_part = date_from_filename(filename)
    pattern_dir = _dir + '/'
    if file_date_part:
        files_date_marker = pattern_dir + file_date_part
        if os.path.exists(files_date_marker):
            return
        else:
            print(f'   Creating in {pattern_dir} date marker file {file_date_part}')
            open(files_date_marker, 'a').close()
    else:
        print(f'   FAILED creating in {pattern_dir} date marker file {file_date_part}')
        return('EMPTY IN set_files_date_marker, must fix if hit')

def get_files_date_marker(_dir):
    ''' Date from filename for files in a directory '''
    pattern_dir = './' + _dir + '/'
    files_with_date = glob.glob(pattern_dir + '*-*')
    if not files_with_date:
        return('')
    files_with_date = files_with_date[0]  # any will do, thus first in the list (even if just one of course)
    file_date_part = date_from_filename(files_with_date)
    if file_date_part:
        return file_date_part
    else:
        print('  EMPTY in get_files_date_marker')
    return

def date_from_filename(filename):
    ''' Pull just the date portion of a filename '''
    return re.sub(r'.*(\d{4}\-\d{2}\-\d{2}).*', r'\1', filename)

def files_concat(files_list):
    ''' Join/concatenate files '''
    df_out = None
    count = 0
    for filename in files_list:
        df = open_file_to_df(filename, doprint=1)
        if count == 0:
            df_out = df.copy()
        else:
            df_out = pd.concat([df_out.reset_index(drop=True), df.reset_index(drop=True)], ignore_index=True)
        count += 1
    df_out = df_out.fillna('')  # There are nans in all NonDomestic, why?
    ''' VAERS_IDs inconsistently have a leading zero or not, fixing that for uniformity.
        This is important for removal of duplicates, for example. Making double sure they are integers.
    '''
    df_out['VAERS_ID'] = pd.to_numeric(df_out.VAERS_ID)
    ''' It's true, CDC publishes files with some full, exact, complete duplicates. Why?
        An example is 896795 in 2020-12-18 NonDomesticVAERSVAX.csv
        Earlier note: For debug, but tricky as df_dupes are not all dupes but merely contain one that is:
            df_duplicated = df_out.loc[df_out.duplicated(subset=df_out.columns) == True].sort_values(by='VAERS_ID')  # each record only once though
            vids_duplicated = df_duplicated.VAERS_ID.to_list()
            df_dupes = df_out.loc[df_out.VAERS_ID.isin(vids_duplicated)]  # to examine them
    '''
    len_before = len(df_out)
    # df_out_ori = df_out.copy()
    df_out = df_out.drop_duplicates(df_out.columns).reset_index(drop=True)  # must avoid dropping on default by index, hence df_out.columns
    if len_before - len(df_out):  # was len(df_out) - len_before, which only worked because a negative int is truthy
        print(f'{(len_before - len(df_out)):>30} exact duplicates dropped in concatenated files, now {len(df_out.VAERS_ID.to_list())} VAERS_IDs')
    return df_out

def linux_path(x):
    return re.sub(r'\\', '/', x)

def subrange(list_in, _max):
    ''' Input a list; for print, return up to _max (like 5) items at the start and end of that list '''
    if len(list_in) == 0:
        return ''  # i.e. not empty square brackets
    if len(list_in) <= _max:
        return list_in
    list_in = sorted(list_in)
    this_many = int(max(min(len(list_in) / 2, _max), 1))
    head = f'{list_in[:this_many]}' if len(list_in) > 1 else ''
    head = re.sub(r'\]', '', head)
    tail = f'{list_in[-this_many:]}' if len(list_in) > 1 else ''
    tail = re.sub(r'\[', '', tail)
    return(f'{len(list_in):>7} {head} ... {tail}')
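# Example of subrange() output (illustrative values):
#   subrange(list(range(100, 120)), 3)
#   ->  '     20 [100, 101, 102 ... 117, 118, 119]'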
def single_plural(count, word):
    if count == 1:
        return re.sub(r'.$', '', word)  # removing the 's' on the end if just 1
    else:
        return word

def line():
    ''' Print line number (debug) '''
    caller = inspect.getframeinfo(inspect.stack()[1][0])
    print(f'   Line {caller.lineno} in {caller.function}()')

def tone():
    ''' Utility sounding a tone. For example at assign_outliers() main work starting, or the end of a run '''
    if not tones:
        return
    print('\a', end='')  # sounds a tone

def do_elapsed(marker_in):
    ''' Calculate elapsed time from a given input marker as the starting point. '''
    tone()
    elapsd = (_time.time() - marker_in) / 60  # Minutes
    return '{} hr {} min'.format(int(elapsd / 60), '%.1f' % (elapsd % 60))

def exit(_in=None):
    ''' Exit with message (intentionally shadows the builtin) '''
    if not _in:
        _in = ''
    print(); print(f'{_in}')
    do_elapsed(elapsed_begin)
    print(); print(f'Done with {__file__} at line {inspect.stack()[1][2]}, {str(datetime.now())}')  # clock time
    print(); print('- - - - - - - - - - - - - - - - - - - - - - - - ')
    tone()
    os._exit(0)

def make_numeric(df_in, col):
    if not len(df_in):
        return df_in
    df_in = df_in.copy()
    df_in[col] = df_in[col].fillna('')
    df_in[col] = df_in[col].astype(str)
    df_in[col] = pd.to_numeric(df_in[col], errors='ignore')
    # Desperation, number columns usually should be empty when 0, setting to 0.0 during the run, emptied later
    df_in.loc[(df_in[col].isna()), col] = 0.0
    df_in[col] = df_in[col].astype('float64').round(4)
    if 'nan' in df_in[col].to_list():
        print('is_nan in make_numeric()')
    return df_in

def move_rows(df_subset, df_move_from, df_move_to):
    ''' Move rows in df_subset out of df_move_from into df_move_to
        Return the new df_move_from and df_move_to
    '''
    if not len(df_subset):
        return df_move_from, df_move_to
    df_move_to = df_move_to.copy()
    df_move_to = pd.concat([df_move_to.reset_index(drop=True), df_subset.reset_index(drop=True)], ignore_index=True)
    df_move_from = df_move_from.loc[~df_move_from.VAERS_ID.isin(df_move_to.VAERS_ID)]  # everything from before, not now in done
    return df_move_from, df_move_to

def move_column_forward(df_in, column):
    ''' Reorder columns, moving column to second (after VAERS_ID) '''
    if column not in df_in:
        return df_in
    columns_pre = list(df_in.columns)
    if columns_pre[1] == column:
        return df_in  # already in that spot
    if 'VAERS_ID' not in df_in.columns:
        col_order = [column]  # start
        for col in df_in.columns:
            if col == column:
                continue
            col_order.append(col)
        return df_in[col_order]  # df_in.reindex(col_order, axis=1)
    col_order = ['VAERS_ID', column]
    for c in columns_pre:
        if c not in col_order:
            col_order.append(c)
    return df_in.reindex(col_order, axis=1)

def print_date_banner(this_drop_date):
    print()
    print('= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ')
    print(f'  Next date {this_drop_date}')
    print('= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = ')
    print()

def stats_initialize(date_currently):
    ''' Per drop, stored in the stats dictionary, then written to stats.csv with each date as a row. '''
    global stats
    stats = {
        'date'             : date_currently,
        'drop_input_covid' : 0,
        'comparisons'      : 0,  # Number of drops.
        'new'              : 0,  # New this drop.
        'deleted'          : 0,  # Were visible, now gone.
        'restored'         : 0,  # Had been deleted.
        'gapfill'          : 0,  # Held back but finally showed up.
        'never_published'  : 0,  # Total still being held back. Gaps in the VAERS_ID sequence.
        'any_ever'         : 0,  # Total any-jab VAERS_IDs ever seen.
        'covid_ever'       : 0,  # Total covid VAERS_IDs ever seen.
    }
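# The core arithmetic in do_counts() below, on toy numbers (illustrative only):
# with lo=890900 and hi=890910 present in a drop, the span is hi - lo = 10; if only
# 8 VAERS_IDs appear, the printout implies 10 - 8 = 2 missing in that span
# (note the span is hi - lo, not the inclusive slot count hi - lo + 1).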
def do_counts(files_date_marker):
    global covid_ever, any_ever, stats
    print(); print('  do_counts')
    files_populate_information()
    df_data_all = files_concat(glob.glob(dir_working + '/' + '*VAERSDATA.csv'))
    print(); print()

    # Checking something
    if 896905 in df_data_all.VAERS_ID.to_list():
        print(f'  ------ {files_date_marker} 896905 in df_data_all.VAERS_ID.to_list()')
        pause = 1
    else:
        print(f'  ------ {files_date_marker} 896905 NOT in df_data_all.VAERS_ID.to_list()')

    len_before = len(df_data_all)
    df_data_all = df_data_all.loc[df_data_all.VAERS_ID >= covid_earliest_vaers_id]
    print(); print(f'{len_before - len(df_data_all):>10} records removed prior to the first covid report (covid_earliest_vaers_id {covid_earliest_vaers_id})')
    print(f'{len(df_data_all):>10} any vax reports to work with (unique VAERS_IDs)'); print()

    vids_list     = df_data_all.VAERS_ID.to_list()
    lo_data_all   = min(vids_list)
    hi_data_all   = max(vids_list)
    diff_data_all = hi_data_all - lo_data_all
    len_data_all  = len(vids_list)
    missing       = diff_data_all - len_data_all
    print(f'{missing:>10} missing (any/all vax never published in covid era) is implied by only {len_data_all} present in {diff_data_all} range with lo {lo_data_all} and hi {hi_data_all}')
    print()

    dict_vids_list = {x: 1 for x in vids_list}
    # vids_new = [x for x in vids_list if x not in any_ever.keys()]
    vids_new = [x for x in vids_list if x not in any_ever]
    print(f'{len(vids_new):>10} being added to `any_ever` for this drop, any/all vax')
    any_ever.update({x: 1 for x in vids_list})
    # any_ever = {**any_ever, **{x: 1 for x in vids_list}}
    # any_ever = {x: 1 for x in set(any_ever.keys() + vids_list)}

    len_before = len(df_data_all)
    df_data_all = df_data_all.drop_duplicates(subset='VAERS_ID')
    if len_before - len(df_data_all):  # was len(df_data_all) - len_before, which only worked because a negative int is truthy
        print(f'{(len_before - len(df_data_all)):>10} duplicates dropped in df_data_all on VAERS_IDs')
    print(f'{len(set(df_data_all.VAERS_ID.to_list())):>10} ALL/ANY reports to work with'); print()

    # Grab all VAERS_IDs before filtering for covid, to be able to identify gaps properly
    do_never_ever(vids_list, files_date_marker, 'do_counts on df_data_all')
    stats['drop_input_covid'] = len(df_data_all)
    print(f'  do_counts of {files_date_marker} done')
    return
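# The gap algebra in do_never_ever() below, on toy numbers (hypothetical IDs):
#   seen so far:      any_ever = {100, 101, 103, 106}
#   expected range:   100..106 inclusive, i.e. range(100, 107)
#   never published:  set(range(100, 107)) - any_ever  ->  {102, 104, 105}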
def do_never_ever(vids_present, date_currently, source):
    ''' Reports never published, identifying gaps in the VAERS_ID sequence.
        Required to keep all VAERS_IDs ever seen so far, doing so as keys in a dictionary.
        Many of these can be non-covid also. Use of dictionaries is fast, lists slow.
        The any_ever file was late on the scene and maybe all of the drop stuff is no longer needed,
        just hi, lo, range (sequence) and gaps.
    '''
    global any_never_published, any_ever, stats
    ''' There's surely some crude logic here that could be fixed '''
    lo_this_drop = min(vids_present)
    hi_this_drop = max(vids_present)
    highest_ever = hi_this_drop
    if any_never_published and any_ever:
        lo_ever      = min(min(any_never_published), min(any_ever), lo_this_drop)
        highest_ever = max(max(any_never_published), max(any_ever), hi_this_drop)
    else:
        lo_ever = lo_this_drop
    list_range_ever_any  = sorted(range(lo_ever, highest_ever + 1))  # range() is non-inclusive at the top end
    list_range_drop_only = sorted(range(lo_this_drop, hi_this_drop + 1))

    # Patch b/c I'm missing the 2021-01-01 drop; these were published but later deleted.
    # Some others may have been deleted since this writing, being missed now, from that date.
    patch = [905000, 905553, 907988, 907989, 908265, 908266, 908269, 908279, 909030, 909617,
             910294, 910295, 910297, 910298, 910300, 910301, 910303, 910304, 910305, 910312,
             910313, 910575, 910577, 910594, 910601, 910617, 910646, 913144]
    if 'never_published_patch_done' not in dict_done_flag:
        dict_done_flag['never_published_patch_done'] = 0  # init thing
    if use_test_cases:
        dict_done_flag['never_published_patch_done'] = 1
    if (not dict_done_flag['never_published_patch_done']) and (hi_this_drop > min(patch)):  # any case except a contiguous-run restart not yet at the missing 2021-01-01 drop
        if date_currently > '2021-01-01':  # This new year's drop might have been skipped by CDC, don't know.
            dict_patch = {x: 1 for x in patch}
            any_ever.update(dict_patch)
            # any_ever = {x: 1 for x in set(any_ever.keys() + dict_patch.keys())}
            # any_ever = {**{x: 1 for x in patch}, **any_ever}  # adding these few, assuming them published for simplicity
            dict_done_flag['never_published_patch_done'] = 1  # global
            print(f'{len(patch):>10} for missing 2021-01-01 drop patch added to any_ever')

    dict_vids_present = {x: 1 for x in vids_present}
    any_ever.update(dict_vids_present)
    # any_ever = {**any_ever, **dict_vids_present}
    # any_ever = {x: 1 for x in set(any_ever.keys() + dict_vids_present.keys())}

    set_of_gap_fills = set(set(vids_present) & any_never_published.keys())
    gaps_new     = set(list_range_drop_only) - set(dict_vids_present.keys())  # any in list_range_drop_only that are not in dict_vids_present
    set_of_never = set(list_range_ever_any) - set(any_ever)  # any in list_range_ever_any that are not in any_ever (expected in range but never ever seen)
    set_of_never = set_of_never - set(any_ever)  # remove any in never if in any_ever, making sure
    any_never_published = {x: 1 for x in sorted(set_of_never)}
    hi_any_never_published = max(set_of_never) if set_of_never else 0  # guard added: max() on an empty set raises ValueError

    print()
    print(f'  From {source}:')
    print(f'    hi_any_never_published {hi_any_never_published}')
    print(f'    lo_ever covid          {lo_ever} covid_earliest_vaers_id')
    print(f'    lo_this_drop covid     {lo_this_drop}')
    print(f'    hi_this_drop covid     {hi_this_drop}')
    print(f'    vids_present covid     {subrange(dict_vids_present, 6)}')
    print(f'    gaps_filled covid      {subrange(set_of_gap_fills, 6)}')
    print(f'    gaps_new any           {subrange(gaps_new, 6)}')
    print(f'    any_never_published    {subrange(set_of_never, 6)}')
    print(f'    list_range_ever_any    {subrange(list_range_ever_any, 6)}')
    print(f'    VAERS_IDs any: {lo_ever:>7} to {max(list_range_ever_any):>7} expected: {len(list_range_ever_any):>7} any_ever: {len(any_ever)} any_never_published: {len(set_of_never)}')

    ''' Sanity check. All VAERS_IDs must be in ONLY either any_ever or any_never_published.
        In Python 3, you can use https://stackoverflow.com/a/49710152/962391
            intersection = dict(dict1.items() & dict2.items())  # in both
            union        = dict(dict1.items() | dict2.items())  # in either
            difference   = dict(dict1.items() ^ dict2.items())  # in only one or the other
            print(f'{len(intersection):>10} intersection')
            print(f'{len(union):>10} union')
            print(f'{len(difference):>10} difference')
    '''
    intersection = dict(any_ever.items() & any_never_published.items())
    if intersection:
        print(f'\n\n\n  WARNING: Expected 0 intersection got {len(intersection)} {subrange(intersection.keys(), 6)} \n\n\n')
    union = dict(any_ever.items() | any_never_published.items())
    if len(union) != len(list_range_ever_any):
        print(f'\n\n\n  WARNING: union {len(union)} expected to equal range {len(list_range_ever_any)} {subrange(intersection.keys(), 6)} \n\n\n')
    print(f'  union: {len(union):>7} {len(intersection)} intersection in any_ever v. any_never_published')
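    # Since every value in these dicts is 1, the items()-based set operations above
    # behave exactly like key-set operations; an equivalent sketch:
    #   set(any_ever) & set(any_never_published)  # IDs in both
    #   set(any_ever) | set(any_never_published)  # IDs in either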
    # Sanity check for none in any_never_published also in any_ever, and vice-versa
    in_both_never_and_ever_1 = [x for x in any_never_published if x in any_ever]
    in_both_never_and_ever_2 = [x for x in any_ever if x in any_never_published]
    if in_both_never_and_ever_1:
        print()
        print(f'  ======= {len(in_both_never_and_ever_1):>10} in_both_never_and_ever_1 {subrange(in_both_never_and_ever_1, 6)}')
        print()
    else:
        print('  in_both_never_and_ever_1 none, ok')
    if in_both_never_and_ever_2:
        print()
        print(f'  ======= {len(in_both_never_and_ever_2):>10} in_both_never_and_ever_2 {subrange(in_both_never_and_ever_2, 6)}')
        print()
    else:
        print('  in_both_never_and_ever_2 none, ok')
    print()

    if 896905 in set_of_never:
        print(f'  ------ {date_currently} 896905 in set_of_never')
        pause = 1
    else:
        print(f'  ------ {date_currently} 896905 NOT in set_of_never')
    if 896905 in any_ever:
        print(f'  ------ {date_currently} 896905 in any_ever')
        pause = 1
    else:
        print(f'  ------ {date_currently} 896905 NOT in any_ever')

    with open(file_never_published, 'w') as f:
        for x in set_of_never:
            f.write(f'{x}\n')
    with open(file_any_ever, 'w') as f:
        for x in list(any_ever.keys()):
            f.write(f'{x}\n')
    return

def run_all():
    print(__file__); print()
    print('run_all() ...'); print()
    if os.path.exists(file_never_published):  Path(file_never_published).unlink()
    if os.path.exists(file_any_ever):         Path(file_any_ever).unlink()
    if os.path.exists(file_covid_ever):       Path(file_covid_ever).unlink()
    if os.path.exists(file_symptoms_deduped): Path(file_symptoms_deduped).unlink()
    validate_dirs_and_files()
    for this_drop_date in files['input']['date']:
        print_date_banner(this_drop_date)
        stats_initialize(this_drop_date)  # per drop, then a totals row is calculated
        open_files(this_drop_date)
        do_counts(this_drop_date)

run_all()
exit()