Data Wrangling - Data Cleanup

 from csv import DictReader

data_rdr = DictReader(open('mn.csv''rt'))
header_rdr = DictReader(open('mn_headers.csv''rt'))
data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr]
print (data_rows[:5])
print (header_rows[:5])

[{'': '1', 'HH1': '1', 'HH2': '17', 'LN': '1', 'MWM1': '1', 'MWM2': '17', 'MWM4': '1', 'MWM5': '14', 'MWM6D': '7', 'MWM6M': '4', 'MWM6Y': '2014', 'MWM7': 'Completed', 'MWM8': '2', 'MWM9': '20', 'MWM10H': '17', 'MWM10M': '59', 'MWM11H': '18', 'MWM11M': '7', 'MWB1M': '5', 'MWB1Y': '1984', 'MWB2': '29', 'MWB3': 'Yes', 'MWB4': 'Higher', 'MWB5': '31', 'MWB7': 'NA', 'MMT2': 'Almost every day', 'MMT3': 'At least once a week', 'MMT4': 'Less than once a week', 'MMT6': 'Yes', 'MMT7': 'Yes', 'MMT8': 'Almost every day', 'MMT9': 'Yes', 'MMT10': 'Yes', 'MMT11': 'Almost every day', 'MMT12': 'Yes', 'MMT13': 'Yes', 'MMT14': 'Almost every day', 'MCM1': 'No', 'MCM3': 'NA', 'MCM4': 'NA', 'MCM5A': 'NA', 'MCM5B': 'NA', 'MCM6': 'NA', 'MCM7A': 'NA', 'MCM7B': 'NA', 'MCM8': 'No', 'MCM9A': 'NA', 'MCM9B': 'NA', 'MCM10': '0', 'MCM11A': 'NA', 'MCM11B': 'NA', 'MCM12M': 'NA', 'MCM12Y': 'NA', 'MDV1A': 'No', 'MDV1B': 'No', 'MDV1C': 'No', 'MDV1D': 'No', 'MDV1E': 'No', 'MDV1F': 'No', 'MMA1': 'Yes, currently married', 'MMA3': 'No (Only one)', 'MMA4': 'NA', 'MMA5': 'NA', 'MMA6': 'NA', 'MMA7': 'Only once', 'MMA8M': '9', 'MMA8Y': '2013', 'MMA9': 'NA', 'MSB1': '20', 'MSB2': 'Yes', 'MSB3U': 'Days ago', 'MSB3N': '0', 'MSB4': 'No', 'MSB5': 'Wife', 'MSB8': 'No', 'MSB9': 'NA', 'MSB10': 'NA', 'MSB13': 'NA', 'MSB14': 'NA', 'MSB15': '5', 'MHA1': 'Yes', 'MHA2': 'Yes', 'MHA3': 'No', 'MHA4': 'Yes', 'MHA5': 'No', 'MHA6': 'No', 'MHA7': 'Yes', 'MHA8A': 'DK', 'MHA8B': 'Yes', 'MHA8C': 'DK', 'MHA9': 'Yes', 'MHA10': 'Yes', 'MHA11': 'No', 'MHA12': 'Yes', 'MHA24': 'Yes', 'MHA25': 'Less than 12 months ago', 'MHA26': 'Yes', 'MHA27': 'NA', 'MMC1': 'No', 'MMC2': 'NA', 'MMC3': 'NA', 'MMC4': 'NA', 'MTA1': 'No', 'MTA2': 'NA', 'MTA3': 'NA', 'MTA4': 'NA', 'MTA5': 'NA', 'MTA6': 'No', 'MTA7': 'NA', 'MTA8A': 'NA', 'MTA8B': 'NA', 'MTA8C': 'NA', 'MTA8D': 'NA', 'MTA8E': 'NA', 'MTA8X': 'NA', 'MTA9': 'NA', 'MTA10': 'No', 'MTA11': 'NA', 'MTA12A': 'NA', 'MTA12B': 'NA', 'MTA12C': 'NA', 'MTA12X': 'NA', 'MTA13': 'NA', 'MTA14': 'Yes', 'MTA15': '16', 'MTA16': '0', 'MTA17': 'NA', 'TNLN': 'NA', 'TN4': 'NA', 'TN5': 'NA', 'TN6': 'NA', 'TN8': 'NA', 'TN9': 'NA', 'TN10': 'NA', 'TN11': 'NA', 'TN12_1': 'NA', 'TN12_2': 'NA', 'TN12_3': 'NA', 'TN12_4': 'NA', 'HH6': 'Urban', 'HH7': 'Bulawayo', 'MWDOI': '1372', 'MWDOB': '1013', 'MWAGE': '25-29', 'MWDOM': '1365', 'MWAGEM': '29', 'MWDOBLC': 'NA', 'MMSTATUS': 'Currently married/in union', 'MCEB': '0', 'MCSURV': '0', 'MCDEAD': '0', 'mwelevel': 'Higher', 'mnweight': '0.403797141860459', 'wscore': '1.60367010204171', 'windex5': '5', 'wscoreu': '1.27255184167736', 'windex5u': '5', 'wscorer': 'NA', 'windex5r': 'NA'}, {'': '2', 'HH1': '1', 'HH2': '20', 'LN': '1', 'MWM1': '1', 'MWM2': '20', 'MWM4': '1', 'MWM5': '14', 'MWM6D': '7', 'MWM6M': '4', 'MWM6Y': '2014', 'MWM7': 'Completed', 'MWM8': '2', 'MWM9': '20', 'MWM10H': '17', 'MWM10M': '32', 'MWM11H': '17', 'MWM11M': '42', 'MWB1M': '5', 'MWB1Y': '1976', 'MWB2': '37', 'MWB3': 'Yes', 'MWB4': 'Higher', 'MWB5': '31', 'MWB7': 'NA', 'MMT2': 'At least once a week', 'MMT3': 'Not at all', 'MMT4': 'Almost every day', 'MMT6': 'Yes', 'MMT7': 'Yes', 'MMT8': 'Almost every day', 'MMT9': 'Yes', 'MMT10': 'Yes', 'MMT11': 'Almost every day', 'MMT12': 'Yes', 'MMT13': 'Yes', 'MMT14': 'Almost every day', 'MCM1': 'No', 'MCM3': 'NA', 'MCM4': 'NA', 'MCM5A': 'NA', 'MCM5B': 'NA', 'MCM6': 'NA', 'MCM7A': 'NA', 'MCM7B': 'NA', 'MCM8': 'No', 'MCM9A': 'NA', 'MCM9B': 'NA', 'MCM10': '0', 'MCM11A': 'NA', 'MCM11B': 'NA', 'MCM12M': 'NA', 'MCM12Y': 'NA', 'MDV1A': 'No', 'MDV1B': 'No', 'MDV1C': 'No', 'MDV1D': 'No', 'MDV1E': 'No', 'MDV1F': 'No', 'MMA1': 'Yes, currently married', 'MMA3': 'No (Only one)', 'MMA4': 'NA', 'MMA5': 'NA', 'MMA6': 'NA', 'MMA7': 'Only once', 'MMA8M': '2', 'MMA8Y': '2014', 'MMA9': 'NA', 'MSB1': '37', 'MSB2': 'No', 'MSB3U': 'Days ago', 'MSB3N': '0', 'MSB4': 'No', 'MSB5': 'Wife', 'MSB8': 'No', 'MSB9': 'NA', 'MSB10': 'NA', 'MSB13': 'NA', 'MSB14': 'NA', 'MSB15': '1', 'MHA1': 'Yes', 'MHA2': 'Yes', 'MHA3': 'No', 'MHA4': 'Yes', 'MHA5': 'No', 'MHA6': 'No', 'MHA7': 'Yes', 'MHA8A': 'No', 'MHA8B': 'Yes', 'MHA8C': 'Yes', 'MHA9': 'Yes', 'MHA10': 'Yes', 'MHA11': 'Yes', 'MHA12': 'No', 'MHA24': 'Yes', 'MHA25': 'Less than 12 months ago', 'MHA26': 'Yes', 'MHA27': 'NA', 'MMC1': 'No', 'MMC2': 'NA', 'MMC3': 'NA', 'MMC4': 'NA', 'MTA1': 'No', 'MTA2': 'NA', 'MTA3': 'NA', 'MTA4': 'NA', 'MTA5': 'NA', 'MTA6': 'No', 'MTA7': 'NA', 'MTA8A': 'NA', 'MTA8B': 'NA', 'MTA8C': 'NA', 'MTA8D': 'NA', 'MTA8E': 'NA', 'MTA8X': 'NA', 'MTA9': 'NA', 'MTA10': 'No', 'MTA11': 'NA', 'MTA12A': 'NA', 'MTA12B': 'NA', 'MTA12C': 'NA', 'MTA12X': 'NA', 'MTA13': 'NA', 'MTA14': 'No', 'MTA15': 'NA', 'MTA16': 'NA', 'MTA17': 'NA', 'TNLN': 'NA', 'TN4': 'NA', 'TN5': 'NA', 'TN6': 'NA', 'TN8': 'NA', 'TN9': 'NA', 'TN10': 'NA', 'TN11': 'NA', 'TN12_1': 'NA', 'TN12_2': 'NA', 'TN12_3': 'NA', 'TN12_4': 'NA', 'HH6': 'Urban', 'HH7': 'Bulawayo', 'MWDOI': '1372', 'MWDOB': '917', 'MWAGE': '35-39', 'MWDOM': '1370', 'MWAGEM': '37', 'MWDOBLC': 'NA', 'MMSTATUS': 'Currently married/in union', 'MCEB': '0', 'MCSURV': '0', 'MCDEAD': '0', 'mwelevel': 'Higher', 'mnweight': '0.403797141860459', 'wscore': '1.54327702631422', 'windex5': '5', 'wscoreu': '1.08902631982422', 'windex5u': '5', 'wscorer': 'NA', 'windex5r': 'NA'}, {'': '3', 'HH1': '2', 'HH2': '1', 'LN': '1', 'MWM1': '2', 'MWM2': '1', 'MWM4': '1', 'MWM5': '9', 'MWM6D': '8', 'MWM6M': '4', 'MWM6Y': '2014', 'MWM7': 'Completed', 'MWM8': '1', 'MWM9': '40', 'MWM10H': '10', 'MWM10M': '37', 'MWM11H': '10', 'MWM11M': '52', 'MWB1M': '2', 'MWB1Y': '1973', 'MWB2': '41', 'MWB3': 'Yes', 'MWB4': 'Primary', 'MWB5': '17', 'MWB7': 'Able to read whole sentence', 'MMT2': 'Not at all', 'MMT3': 'Almost every day', 'MMT4': 'Less than once a week', 'MMT6': 'No', 'MMT7': 'NA', 'MMT8': 'NA', 'MMT9': 'No', 'MMT10': 'NA', 'MMT11': 'NA', 'MMT12': 'Yes', 'MMT13': 'Yes', 'MMT14': 'Almost every day', 'MCM1': 'Yes', 'MCM3': '21', 'MCM4': 'Yes', 'MCM5A': '1', 'MCM5B': '1', 'MCM6': 'Yes', 'MCM7A': '1', 'MCM7B': '0', 'MCM8': 'No', 'MCM9A': 'NA', 'MCM9B': 'NA', 'MCM10': '3', 'MCM11A': 'No', 'MCM11B': '2', 'MCM12M': '5', 'MCM12Y': '2012', 'MDV1A': 'No', 'MDV1B': 'No', 'MDV1C': 'No', 'MDV1D': 'No', 'MDV1E': 'No', 'MDV1F': 'No', 'MMA1': 'Yes, currently married', 'MMA3': 'No (Only one)', 'MMA4': 'NA', 'MMA5': 'NA', 'MMA6': 'NA', 'MMA7': 'More than once', 'MMA8M': '8', 'MMA8Y': '1991', 'MMA9': 'NA', 'MSB1': '20', 'MSB2': 'No', 'MSB3U': 'Days ago', 'MSB3N': '0', 'MSB4': 'Yes', 'MSB5': 'Wife', 'MSB8': 'No', 'MSB9': 'NA', 'MSB10': 'NA', 'MSB13': 'NA', 'MSB14': 'NA', 'MSB15': '5', 'MHA1': 'Yes', 'MHA2': 'Yes', 'MHA3': 'No', 'MHA4': 'Yes', 'MHA5': 'No', 'MHA6': 'No', 'MHA7': 'Yes', 'MHA8A': 'No', 'MHA8B': 'Yes', 'MHA8C': 'No', 'MHA9': 'Yes', 'MHA10': 'Yes', 'MHA11': 'No', 'MHA12': 'Yes', 'MHA24': 'Yes', 'MHA25': '2 or more years ago', 'MHA26': 'Yes', 'MHA27': 'NA', 'MMC1': 'No', 'MMC2': 'NA', 'MMC3': 'NA', 'MMC4': 'NA', 'MTA1': 'Yes', 'MTA2': '18', 'MTA3': 'Yes', 'MTA4': '10', 'MTA5': '30', 'MTA6': 'Yes', 'MTA7': 'Yes', 'MTA8A': 'NA', 'MTA8B': 'NA', 'MTA8C': 'NA', 'MTA8D': 'NA', 'MTA8E': 'Rolled tobacco', 'MTA8X': 'NA', 'MTA9': '7', 'MTA10': 'No', 'MTA11': 'NA', 'MTA12A': 'NA', 'MTA12B': 'NA', 'MTA12C': 'NA', 'MTA12X': 'NA', 'MTA13': 'NA', 'MTA14': 'Yes', 'MTA15': '19', 'MTA16': '8', 'MTA17': '2', 'TNLN': 'NA', 'TN4': 'NA', 'TN5': 'NA', 'TN6': 'NA', 'TN8': 'NA', 'TN9': 'NA', 'TN10': 'NA', 'TN11': 'NA', 'TN12_1': 'NA', 'TN12_2': 'NA', 'TN12_3': 'NA', 'TN12_4': 'NA', 'HH6': 'Urban', 'HH7': 'Bulawayo', 'MWDOI': '1372', 'MWDOB': '878', 'MWAGE': '40-44', 'MWDOM': '1100', 'MWAGEM': '18', 'MWDOBLC': 'NA', 'MMSTATUS': 'Currently married/in union', 'MCEB': '3', 'MCSURV': '3', 'MCDEAD': '0', 'mwelevel': 'Primary', 'mnweight': '1.03192602919895', 'wscore': '0.878635263695964', 'windex5': '4', 'wscoreu': '-0.930720561098312', 'windex5u': '1', 'wscorer': 'NA', 'windex5r': 'NA'}, {'': '4', 'HH1': '2', 'HH2': '1', 'LN': '5', 'MWM1': '2', 'MWM2': '1', 'MWM4': '5', 'MWM5': '9', 'MWM6D': '12', 'MWM6M': '4', 'MWM6Y': '2014', 'MWM7': 'Not at home', 'MWM8': '1', 'MWM9': '40', 'MWM10H': 'NA', 'MWM10M': 'NA', 'MWM11H': 'NA', 'MWM11M': 'NA', 'MWB1M': 'NA', 'MWB1Y': 'NA', 'MWB2': 'NA', 'MWB3': 'NA', 'MWB4': 'NA', 'MWB5': 'NA', 'MWB7': 'NA', 'MMT2': 'NA', 'MMT3': 'NA', 'MMT4': 'NA', 'MMT6': 'NA', 'MMT7': 'NA', 'MMT8': 'NA', 'MMT9': 'NA', 'MMT10': 'NA', 'MMT11': 'NA', 'MMT12': 'NA', 'MMT13': 'NA', 'MMT14': 'NA', 'MCM1': 'NA', 'MCM3': 'NA', 'MCM4': 'NA', 'MCM5A': 'NA', 'MCM5B': 'NA', 'MCM6': 'NA', 'MCM7A': 'NA', 'MCM7B': 'NA', 'MCM8': 'NA', 'MCM9A': 'NA', 'MCM9B': 'NA', 'MCM10': 'NA', 'MCM11A': 'NA', 'MCM11B': 'NA', 'MCM12M': 'NA', 'MCM12Y': 'NA', 'MDV1A': 'NA', 'MDV1B': 'NA', 'MDV1C': 'NA', 'MDV1D': 'NA', 'MDV1E': 'NA', 'MDV1F': 'NA', 'MMA1': 'NA', 'MMA3': 'NA', 'MMA4': 'NA', 'MMA5': 'NA', 'MMA6': 'NA', 'MMA7': 'NA', 'MMA8M': 'NA', 'MMA8Y': 'NA', 'MMA9': 'NA', 'MSB1': 'NA', 'MSB2': 'NA', 'MSB3U': 'NA', 'MSB3N': 'NA', 'MSB4': 'NA', 'MSB5': 'NA', 'MSB8': 'NA', 'MSB9': 'NA', 'MSB10': 'NA', 'MSB13': 'NA', 'MSB14': 'NA', 'MSB15': 'NA', 'MHA1': 'NA', 'MHA2': 'NA', 'MHA3': 'NA', 'MHA4': 'NA', 'MHA5': 'NA', 'MHA6': 'NA', 'MHA7': 'NA', 'MHA8A': 'NA', 'MHA8B': 'NA', 'MHA8C': 'NA', 'MHA9': 'NA', 'MHA10': 'NA', 'MHA11': 'NA', 'MHA12': 'NA', 'MHA24': 'NA', 'MHA25': 'NA', 'MHA26': 'NA', 'MHA27': 'NA', 'MMC1': 'NA', 'MMC2': 'NA', 'MMC3': 'NA', 'MMC4': 'NA', 'MTA1': 'NA', 'MTA2': 'NA', 'MTA3': 'NA', 'MTA4': 'NA', 'MTA5': 'NA', 'MTA6': 'NA', 'MTA7': 'NA', 'MTA8A': 'NA', 'MTA8B': 'NA', 'MTA8C': 'NA', 'MTA8D': 'NA', 'MTA8E': 'NA', 'MTA8X': 'NA', 'MTA9': 'NA', 'MTA10': 'NA', 'MTA11': 'NA', 'MTA12A': 'NA', 'MTA12B': 'NA', 'MTA12C': 'NA', 'MTA12X': 'NA', 'MTA13': 'NA', 'MTA14': 'NA', 'MTA15': 'NA', 'MTA16': 'NA', 'MTA17': 'NA', 'TNLN': 'NA', 'TN4': 'NA', 'TN5': 'NA', 'TN6': 'NA', 'TN8': 'NA', 'TN9': 'NA', 'TN10': 'NA', 'TN11': 'NA', 'TN12_1': 'NA', 'TN12_2': 'NA', 'TN12_3': 'NA', 'TN12_4': 'NA', 'HH6': 'Urban', 'HH7': 'Bulawayo', 'MWDOI': '1372', 'MWDOB': 'NA', 'MWAGE': 'NA', 'MWDOM': 'NA', 'MWAGEM': 'NA', 'MWDOBLC': 'NA', 'MMSTATUS': 'NA', 'MCEB': 'NA', 'MCSURV': 'NA', 'MCDEAD': 'NA', 'mwelevel': 'NA', 'mnweight': '0', 'wscore': '0', 'windex5': '0', 'wscoreu': '0', 'windex5u': '0', 'wscorer': '0', 'windex5r': '0'}, {'': '5', 'HH1': '2', 'HH2': '1', 'LN': '8', 'MWM1': '2', 'MWM2': '1', 'MWM4': '8', 'MWM5': '9', 'MWM6D': '8', 'MWM6M': '4', 'MWM6Y': '2014', 'MWM7': 'Completed', 'MWM8': '1', 'MWM9': '40', 'MWM10H': '10', 'MWM10M': '53', 'MWM11H': '11', 'MWM11M': '10', 'MWB1M': '2', 'MWB1Y': '1993', 'MWB2': '21', 'MWB3': 'Yes', 'MWB4': 'Secondary', 'MWB5': '24', 'MWB7': 'NA', 'MMT2': 'Less than once a week', 'MMT3': 'At least once a week', 'MMT4': 'Less than once a week', 'MMT6': 'No', 'MMT7': 'NA', 'MMT8': 'NA', 'MMT9': 'No', 'MMT10': 'NA', 'MMT11': 'NA', 'MMT12': 'Yes', 'MMT13': 'Yes', 'MMT14': 'Almost every day', 'MCM1': 'No', 'MCM3': 'NA', 'MCM4': 'NA', 'MCM5A': 'NA', 'MCM5B': 'NA', 'MCM6': 'NA', 'MCM7A': 'NA', 'MCM7B': 'NA', 'MCM8': 'No', 'MCM9A': 'NA', 'MCM9B': 'NA', 'MCM10': '0', 'MCM11A': 'NA', 'MCM11B': 'NA', 'MCM12M': 'NA', 'MCM12Y': 'NA', 'MDV1A': 'No', 'MDV1B': 'No', 'MDV1C': 'No', 'MDV1D': 'No', 'MDV1E': 'No', 'MDV1F': 'No', 'MMA1': 'No, not in union', 'MMA3': 'NA', 'MMA4': 'NA', 'MMA5': 'No', 'MMA6': 'NA', 'MMA7': 'NA', 'MMA8M': 'NA', 'MMA8Y': 'NA', 'MMA9': 'NA', 'MSB1': '19', 'MSB2': 'Yes', 'MSB3U': 'Months ago', 'MSB3N': '7', 'MSB4': 'Yes', 'MSB5': 'Girlfriend', 'MSB8': 'No', 'MSB9': 'NA', 'MSB10': 'NA', 'MSB13': 'NA', 'MSB14': 'NA', 'MSB15': '3', 'MHA1': 'Yes', 'MHA2': 'Yes', 'MHA3': 'No', 'MHA4': 'Yes', 'MHA5': 'No', 'MHA6': 'No', 'MHA7': 'Yes', 'MHA8A': 'Yes', 'MHA8B': 'Yes', 'MHA8C': 'Yes', 'MHA9': 'Yes', 'MHA10': 'Yes', 'MHA11': 'No', 'MHA12': 'Yes', 'MHA24': 'Yes', 'MHA25': '12-23 months ago', 'MHA26': 'Yes', 'MHA27': 'NA', 'MMC1': 'No', 'MMC2': 'NA', 'MMC3': 'NA', 'MMC4': 'NA', 'MTA1': 'No', 'MTA2': 'NA', 'MTA3': 'NA', 'MTA4': 'NA', 'MTA5': 'NA', 'MTA6': 'No', 'MTA7': 'NA', 'MTA8A': 'NA', 'MTA8B': 'NA', 'MTA8C': 'NA', 'MTA8D': 'NA', 'MTA8E': 'NA', 'MTA8X': 'NA', 'MTA9': 'NA', 'MTA10': 'No', 'MTA11': 'NA', 'MTA12A': 'NA', 'MTA12B': 'NA', 'MTA12C': 'NA', 'MTA12X': 'NA', 'MTA13': 'NA', 'MTA14': 'Yes', 'MTA15': '20', 'MTA16': '10', 'MTA17': '2', 'TNLN': 'NA', 'TN4': 'NA', 'TN5': 'NA', 'TN6': 'NA', 'TN8': 'NA', 'TN9': 'NA', 'TN10': 'NA', 'TN11': 'NA', 'TN12_1': 'NA', 'TN12_2': 'NA', 'TN12_3': 'NA', 'TN12_4': 'NA', 'HH6': 'Urban', 'HH7': 'Bulawayo', 'MWDOI': '1372', 'MWDOB': '1118', 'MWAGE': '20-24', 'MWDOM': 'NA', 'MWAGEM': 'NA', 'MWDOBLC': 'NA', 'MMSTATUS': 'Never married/in union', 'MCEB': '0', 'MCSURV': '0', 'MCDEAD': '0', 'mwelevel': 'Secondary', 'mnweight': '1.03192602919895', 'wscore': '0.878635263695964', 'windex5': '4', 'wscoreu': '-0.930720561098312', 'windex5u': '1', 'wscorer': 'NA', 'windex5r': 'NA'}] [{'Name': 'HH1', 'Label': 'Cluster number', 'Question': ''}, {'Name': 'HH2', 'Label': 'Household number', 'Question': ''}, {'Name': 'LN', 'Label': 'Line number', 'Question': ''}, {'Name': 'MWM1', 'Label': 'Cluster number', 'Question': ''}, {'Name': 'MWM2', 'Label': 'Household number', 'Question': ''}]

for data_dict in data_rows:
  for dkey, dval in data_dict.items():
    for header_dict in header_rows:
      for hkey, hval in header_dict.items():
        if dkey == hval:
            print('match!')


match! match! match! match! match! match! match! match! match! match! match! match! match! match! match!
.....
new_rows = []
for data_dict in data_rows:
  new_row = {}
  for dkey, dval in data_dict.items():
    for header_dict in header_rows:
      if dkey in header_dict.values():
        new_row[header_dict.get('Label')] = dval
  new_rows.append(new_row)

new_rows[0]

'Relationship to last sexual partner': 'Wife', 'Sex with any other person in the last 12 month': 'No', 'Condom used with prior sexual partner': 'NA', 'Relationship to prior sexual partner': 'NA', 'Sex with any other man in the last 12 months': 'NA', 'Number of sex partners in last 12 months': 'NA', 'Number of sex partners in lifetime': '5', 'Ever heard of AIDS': 'Yes', 'Can avoid AIDS virus by having one uninfected partner': 'Yes', 'Can get AIDS virus through supernatural means': 'No', 'Can avoid AIDS virus by using a condom correctly every time': 'Yes', 'Can get AIDS virus from mosquito bites': 'No', 'Can get AIDS virus by sharing food with a person who has AIDS': 'No', 'Healthy-looking person may have AIDS virus': 'Yes', 'AIDS virus from mother to child during pregnancy': 'DK', 'AIDS virus from mother to child during delivery': 'Yes', 'AIDS virus from mother to child through breastfeeding': 'DK', 'Should female teacher with AIDS virus be allowed to teach in school': 'Yes', 'Would buy fresh vegetables from shopkeeper with AIDS virus': 'Yes', 'If HH member became infected with AIDS virus, would want it to remain a secret': 'No', 'Willing to care for person with AIDS in household': 'Yes', 'Ever been tested for AIDS virus': 'Yes', 'Most recent time of testing for AIDS virus': 'Less than 12 months ago', 'Received results of AIDS virus test': 'Yes', 'Know a place to get AIDS virus test': 'NA', 'Ever tried cigarette smoking': 'No', 'Age when cigarette was smoked for the first time': 'NA', 'Currently smoking cigarettes': 'NA', 'Number of cigarettes smoked in the last 24 hours': 'NA', 'Number of days when cigarettes were smoked in past month': 'NA', 'Ever tried any smoked tobacco products other than cigarettes': 'No', 'Used any smoked tobacco products during the last month': 'NA', 'Type of smoked tobacco product: Cigars': 'NA', 'Type of smoked tobacco product: Water pipe': 'NA', 'Type of smoked tobacco product: Cigarillos': 'NA', 'Type of smoked tobacco product: Pipe': 'NA', 'Type of smoked tobacco product: Other': 'NA', 'Number of days when tobacco products where smoked in past month': 'NA', 'Ever tried any form of smokeless tobacco products': 'No', 'Used any smokeless tobacco products during the last month': 'NA', 'Type of smokeless tobacco product used: Chewing tobacco': 'NA', 'Type of smokeless tobacco product used: Snuff': 'NA', 'Type of smokeless tobacco product used: Dip': 'NA', 'Type of smokeless tobacco product used: Other': 'NA', 'Number of days when smokeless tobacco products where used in past month': 'NA', 'Ever drunk alcohol': 'Yes', 'Age when alcohol was used for the first time': '16', 'Number of days when at least one drink of alcohol was used in past month': '0', 'Number of drinks usually consumed': 'NA', 'Months ago net obtained': 'NA', 'Net treated with an insecticide when obtained': 'NA', 'Net soaked or dipped since obtained': 'NA', 'Months ago net soaked or dipped': 'NA', 'Persons slept under mosquito net last night': 'NA', 'Person 1 who slept under net': 'NA', 'Person 2 who slept under net': 'NA', 'Person 3 who slept under net': 'NA', 'Person 4 who slept under net': 'NA'}


from csv import reader
data_rdr = reader(open('mn.csv''rt'))
header_rdr = reader(open('mn_headers.csv''rt'))
data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr]
print (len(data_rows[0]))
print (len(header_rows))

159 210

data_rows[0]
['', 'HH1', 'HH2', 'LN', 'MWM1', 'MWM2', 'MWM4', 'MWM5', 'MWM6D', 'MWM6M', 'MWM6Y', 'MWM7', 'MWM8', 'MWM9', 'MWM10H', 'MWM10M', 'MWM11H', 'MWM11M', 'MWB1M', 'MWB1Y', 'MWB2', 'MWB3', 'MWB4', 'MWB5', 'MWB7', 'MMT2', 'MMT3', 'MMT4', 'MMT6', 'MMT7', 'MMT8', 'MMT9', 'MMT10', 'MMT11', 'MMT12', 'MMT13', 'MMT14', 'MCM1', 'MCM3', 'MCM4', 'MCM5A', 'MCM5B', 'MCM6', 'MCM7A', 'MCM7B', 'MCM8', 'MCM9A', 'MCM9B', 'MCM10', 'MCM11A', 'MCM11B', 'MCM12M', 'MCM12Y', 'MDV1A', 'MDV1B', 'MDV1C', 'MDV1D', 'MDV1E', 'MDV1F', 'MMA1', 'MMA3', 'MMA4', 'MMA5', 'MMA6', 'MMA7', 'MMA8M', 'MMA8Y', 'MMA9', 'MSB1', 'MSB2', 'MSB3U', 'MSB3N', 'MSB4', 'MSB5', 'MSB8', 'MSB9', 'MSB10', 'MSB13', 'MSB14', 'MSB15', 'MHA1', 'MHA2', 'MHA3', 'MHA4', 'MHA5', 'MHA6', 'MHA7', 'MHA8A', 'MHA8B', 'MHA8C', 'MHA9', 'MHA10', 'MHA11', 'MHA12', 'MHA24', 'MHA25', 'MHA26', 'MHA27', 'MMC1', 'MMC2', 'MMC3', 'MMC4', 'MTA1', 'MTA2', 'MTA3', 'MTA4', 'MTA5', 'MTA6', 'MTA7', 'MTA8A', 'MTA8B', 'MTA8C', 'MTA8D', 'MTA8E', 'MTA8X', 'MTA9', 'MTA10', 'MTA11', 'MTA12A', 'MTA12B', 'MTA12C', 'MTA12X', 'MTA13', 'MTA14', 'MTA15', 'MTA16', 'MTA17', 'TNLN', 'TN4', 'TN5', 'TN6', 'TN8', 'TN9', 'TN10', 'TN11', 'TN12_1', 'TN12_2', 'TN12_3', 'TN12_4', 'HH6', 'HH7', 'MWDOI', 'MWDOB', 'MWAGE', 'MWDOM', 'MWAGEM', 'MWDOBLC', 'MMSTATUS', 'MCEB', 'MCSURV', 'MCDEAD', 'mwelevel', 'mnweight', 'wscore', 'windex5', 'wscoreu', 'windex5u', 'wscorer', 'windex5r']

header_rows[:2]
[['HH1', 'Cluster number', ''], ['HH2', 'Household number', '']]
bad_rows = []
for h in header_rows:
  if h[0not in data_rows[0]:
    bad_rows.append(h)
for h in bad_rows:
  header_rows.remove(h)
print(len(header_rows))

150
all_short_headers = [h[0for h in header_rows]
for header in data_rows[0]:
  if header not in all_short_headers:
    print ('mismatch!', header)

mismatch! mismatch! MDV1F mismatch! MTA8E mismatch! mwelevel mismatch! mnweight mismatch! wscoreu mismatch! windex5u mismatch! wscorer mismatch! windex5r
from csv import reader
data_rdr = reader(open('mn.csv''rt'))
header_rdr = reader(open('mn_headers_updated.csv''rt'))
data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr if h[0in data_rows[0]]
print(len(header_rows))
all_short_headers = [h[0for h in header_rows]
skip_index = []
for header in data_rows[0]:
  if header not in all_short_headers:
    index = data_rows[0].index(header)
    skip_index.append(index)
new_data = []
for row in data_rows[1:]:
  new_row = []
  for i, d in enumerate(row):
    if i not in skip_index:
      new_row.append(d)
  new_data.append(new_row)
zipped_data = []
for drow in new_data:
  zipped_data.append(zip(header_rows, drow))

152

zipped_data[0]
<zip at 0x7fa99615dc40>

data_headers = []
for i, header in enumerate(data_rows[0]):
  if i not in skip_index:
    data_headers.append(header)
header_match = zip(data_headers, all_short_headers)
print(header_match)

<zip object at 0x7fa9969f37c0>

from csv import reader
data_rdr = reader(open('mn.csv''rt',encoding='utf-8'))
header_rdr = reader(open('mn_headers_updated.csv''rt',encoding='utf-8'))
data_rows = [d for d in data_rdr]
header_rows = [h for h in header_rdr if h[0in data_rows[0]]
all_short_headers = [h[0for h in header_rows]
skip_index = []
final_header_rows = []
for header in data_rows[0]:
  if header not in all_short_headers:
    index = data_rows[0].index(header)
    skip_index.append(index)
  else:
    for head in header_rows:
      if head[0] == header:
        final_header_rows.append(head)
        break
new_data = []
for row in data_rows[1:]:
  new_row = []
  for i, d in enumerate(row):
    if i not in skip_index:
      new_row.append(d)
  new_data.append(new_row)
zipped_data = []
for drow in new_data:
  zipped_data.append(zip(final_header_rows, drow))

zipped_data[0]

<zip at 0x7fa9b5fbdc00>

Comments