Matching a list of names to files when the names are reversed

121 Views Asked by At

I have a .csv with a column called 'Name' listing individuals in Firstname MI. Lastname format:

  • Stephen A. Johnson.

And I have a folder full of files made by these individuals in lastnamefirstname format:

  • johnsonstephen_78938

Using the list of names I need to find the files that match and save those files to a separate folder.

What is the best way to do this?

2

There are 2 best solutions below

5
Vitalizzare On BEST ANSWER

Model

Let's prepare some space to model:

# necessary part
import pandas as pd
from pathlib import Path
from shutil import copy     # consider copy2 as alternative if metadata should be copied as well

# for demonstration purpose
from tempfile import TemporaryDirectory 
from os import listdir
from io import StringIO

data = '''
Stephen A. Johnson
Stephen B. Johnson
Albert J. Reiß
'''
csv_source = StringIO(data)

source = TemporaryDirectory(); src = Path(source.name)
destin = TemporaryDirectory(); dst = Path(destin.name)

filenames = [
    'johnsonstephen_30',
    'johnsonstephen_40',
    'reissalbert_50',     # should be copied with `casefold` and ignored with `lower`
    'gibberish_10',       # should be ignored
]

for file in filenames:
    (src/file).touch()

Now the main part. There's at least one pitfall: casefold versus lower when applied to the original names. This may influence on the file names selected and should be consciously resolved on your side. I use casefold and the name "Reiß" to demonstrate how it might appear.

# read the sequence of names
person = pd.read_csv(csv_source, header=None).squeeze()

# preparing name stems
ersatz = (
    person
    .str.casefold()     # not identical to `lower`, select one according to naming policy
    .str.split()
    .str[::-2]          # it won't work if the middle name doesn't exist.
    .str.join('')
    .unique()
)

# copying files of interest
for name in ersatz:
    for file in src.rglob(f'{name}_*'):
        copy(file, dst)     

Check the result and clean up the space:

print(f'Source:      {listdir(src)}')
print(f'Destination: {listdir(dst)}')

source.cleanup()
destin.cleanup()

Expected output:

Source:      ['gibberish_1234', 'johnsonstephen_40', 'johnsonstephen_30', 'reissalbert_50']
Destination: ['johnsonstephen_40', 'johnsonstephen_30', 'reissalbert_50']

P.S. Alternatively, when loading a csv file, we can interpret whitespace characters as a delimiter:

# read the sequence of names
person = pd.read_csv(
    csv_source, 
    header=None, 
    delim_whitespace=True, 
    usecols=[0,2],
    names=['firstname','lastname'],
)
ersatz = person['lastname'].str.casefold() + person['firstname'].str.casefold()

Real life

from pandas import read_csv
from pathlib import Path
from shutil import copy     

# declare csv-file and source/destination folders
source = 'path/to/folder/with/files'
destination = 'folder/to/copy/files/into'
csv_source = 'path/to/comma_separated_data_with_column_Name.csv'

# main script
src = Path(source)
dst = Path(destination)

person = read_csv(csv_source)['Name'].squeeze()
ersatz = (
    person
    .str.casefold()
    .str.split()
    .pipe(lambda seq: seq.str[-1] + seq.str[0])    # rely only on first and last name
    .unique()
)

for name in ersatz:
    for file in src.rglob(f'{name}_*'):
        copy(file, dst)
4
taller On

A code snippet demonstrates how to compare name in differece format.

nameStr = 'Stephen A. Johnson'.split()
nameFile = 'johnsonstephen_78938'

if ( (nameStr[-1]+nameStr[0]).lower() == nameFile.split("_")[0].lower() ):
  print('match')
else:
  print('mismatch')

Let's assume

  • csv file as below
Name,Age
Stephen A. Johnson,34
Mike Jackson,22
  • source file path: D:\temp\src
  • destination file path: D:\temp\dest
import pandas as pd
import pathlib as Path
import shutil as sh
path = Path.Path(r'D:\temp\namelist.csv')
file_path_src = Path.Path(r'D:\temp\src')
file_path_dest = Path.Path(r'D:\temp\dest')
files = {}
# iterate through the files in the source folder
for file in file_path_src.iterdir():
    # check the file name has "_"
    if file.name.count("_") == 1:
        sname = file.name.split("_")[0].lower()
        if sname in files:
            files[sname].append(file)
        else:
            files[sname] = [file]
print(files)
# load csv file
df = pd.read_csv(path, header=0)
# create destination folder if not exists
if not file_path_dest.exists():
    file_path_dest.mkdir()
# iterate through the names in the csv file
for name in df['Name'].tolist():
    name_arr = name.split(" ")
    namekey = (name_arr[-1]+name_arr[0]).lower()
    # chech if the name is in the files dict
    if namekey in files:
        for file in files[namekey]:
            print('Copying file: ', file )
            # copy the file to the destination folder
            sh.copy(file, file_path_dest)
print('Done')