Eliminate sub elements in a huge list of strings as long as no duplicates appear

77 Views Asked by At

I have a huge list of strings (~15000). Each string is unique within the list. Every string consists of words separated by dots. The algorithm should eliminate the middle terms one at a time (each middle term is considered separately), but only as long as the elimination does not create duplicates.

My code only works when every string has exactly three elements; in that case it works fine. When there is more than one middle term, I have no idea how to handle each term separately, from left to right.

Any ideas?

import pandas as pd

def remove_middle_words(terms):
    """Collapse each term to ``first.last`` where that form is unique.

    Every term is a dot-separated string.  All middle words of a term are
    dropped together (not one by one) when the resulting ``first.last``
    form is produced by exactly one term of the input; otherwise the term
    is returned unchanged.

    Unlike a fixed three-column split, this accepts terms with any number
    of words.  Terms with fewer than three words have no middle word and
    are returned as they are, but they still take part in the uniqueness
    count so that a collapsed term can never collide with them.

    Args:
        terms: List of dot-separated strings.

    Returns:
        A new list, in input order, holding the collapsed or original terms.
    """
    from collections import Counter

    def collapse(term):
        # A term needs at least three words to have anything to drop.
        parts = term.split('.')
        if len(parts) < 3:
            return term
        return f"{parts[0]}.{parts[-1]}"

    collapsed = [collapse(term) for term in terms]
    counts = Counter(collapsed)

    # A collapsed form shared by several terms would create duplicates,
    # so those terms keep their full original text.
    return [
        short if counts[short] == 1 else term
        for term, short in zip(terms, collapsed)
    ]
# Cases 3 and 4 from the examples below; both are handled correctly:
# B3 is dropped (A.C4 and A.C5 stay unique), while B4 and B5 are kept
# because removing them would turn both terms into A.C6.
terms = ['A.B3.C4', 'A.B3.C5', 'A.B4.C6', 'A.B5.C6']
new_terms = remove_middle_words(terms)
print(new_terms)

Examples:

Case1 (not ok with code below):

  • A.B1.C1.D1 --> A.D1
  • A.B1.C1.D2 --> A.D2 (both B1 and C1 could be eliminated)

Case2 (not ok with code below):

  • A.B2.C2.D3 --> A.C2.D3
  • A.B2.C3.D3 --> A.C3.D3 (only B2 can be eliminated, because eliminating C2/C3 would turn both terms into the duplicate A.D3)

Case3 (ok with code below):

  • A.B3.C4 --> A.C4
  • A.B3.C5 --> A.C5 (B3 could be eliminated)

Case4 (ok with code below):

  • A.B4.C6
  • A.B5.C6 (nothing can be eliminated, because eliminating B4/B5 would turn both terms into the duplicate A.C6)

Case5 (not ok with code below)

  • A.B10.C10.D1
  • A.B20.C10.D1
  • A.B20.C20.D1 (nothing can be eliminated, because if either Bx or Cx were eliminated, the terms would become duplicates of A.D1)

Case6a (not ok with code below)

  • A.B100.C100.D100.D1 --> A.B100.D1
  • A.B200.C100.D100.D1 --> A.B200.D1 (C100 and D100 can be eliminated; keeping Bx leaves the terms unique)

Case6b (not ok with code below)

  • A.B300.C200.D100.D1 --> A.C200.D1
  • A.B300.C300.D100.D1 --> A.C300.D1 (B300 and D100 can be eliminated; keeping Cx leaves the terms unique)
1

There are 1 best solutions below

3
ASW22 On BEST ANSWER
import pandas as pd
import numpy as np

# Sample input based on the example cases from the question, plus two edge
# cases: a term without any dot ("D1") and an empty string.
terms = ['A.B3.C4', 'A.B3.C5', 'A.B4.C6', 'A.B5.C6', 
         'A1.B1.C1.D1', 'A1.B1.C1.D2', "D1",
        "A.B10.C10.D1", "A.B20.C10.D1", "A.B20.C20.D1", 
        "A.B100.C100.D100.D1", "A.B200.C100.D100.D1",
        "A.B300.C200.D100.D1", "A.B300.C300.D100.D1",
        ""    
        ]

def rem_mid_words(txt1):
    """Collapse a dot-separated term to its first and last word.

    Args:
        txt1: Term such as ``"A.B.C.D"``.

    Returns:
        ``"<first>.<last>"`` when *txt1* contains at least one dot (so a
        two-word term comes back unchanged); otherwise *txt1* itself.
    """
    if "." not in txt1:
        # Hand the term back unchanged instead of falling through to an
        # implicit None, so callers never get None as a counting key or as
        # an output value.
        return txt1
    first, *_, last = txt1.split(".")
    return f"{first}.{last}"

# Collapse every term once, then tally how often each collapsed form occurs.
collapsed_terms = [rem_mid_words(txt1) for txt1 in terms]
term_count = {}
for collapsed in collapsed_terms:
    term_count[collapsed] = term_count.get(collapsed, 0) + 1

# Use the collapsed form only where it is unique; otherwise keep the original.
new_terms = [
    collapsed if term_count[collapsed] < 2 else txt1
    for txt1, collapsed in zip(terms, collapsed_terms)
]
print(new_terms)


# start of adjusted answer

def rem_one_word(txt1, len1, pos):
    """Drop the word at index *pos* from a term that has exactly *len1* words.

    Example: ``"A1.B2.C3.D4.E5"`` has 5 words at indices 0-4.  With
    ``len1=5`` and ``pos=1`` the word ``B2`` is removed and
    ``"A1.C3.D4.E5"`` is returned.

    Args:
        txt1: Dot-separated term, or ``None``.
        len1: Word count a term must have for the removal to apply.
        pos: Zero-based index of the word to remove.

    Returns:
        The shortened term, or *txt1* unchanged when it is ``None``,
        contains no dot, or does not consist of exactly *len1* words.
    """
    # None is checked by identity, which is the idiomatic form and cannot
    # be affected by a custom __ne__ implementation.
    if txt1 is None or "." not in txt1:
        return txt1

    words = txt1.split(".")
    if len(words) != len1:
        return txt1
    return ".".join(words[:pos] + words[pos + 1:])

def word_len(txt1):
    """Return the number of dot-separated words in *txt1*, or 0 if it has no dot."""
    dots = txt1.count(".")
    # n dots delimit n + 1 words; a term without any dot is reported as 0.
    return dots + 1 if dots else 0

def shorten_words(terms):
    """Remove middle words from dot-separated terms while keeping them unique.

    Terms are processed by word count, longest first.  For each word count
    ``len1`` (down to 3) and each middle position ``pos`` from left to
    right, the word at ``pos`` is removed from every term that currently
    has exactly ``len1`` words.  A term keeps its shortened form only if
    that form occurs once among all candidate terms of the pass; otherwise
    it is left as it was for this pass.

    Progress ("string length" / "pos") is printed for every pass.

    Args:
        terms: List of dot-separated strings.

    Returns:
        A list with one entry per input term, in input order.  An empty
        input yields an empty list.
    """
    from collections import Counter

    # max() raises ValueError on an empty sequence, so answer that case
    # directly.
    if not terms:
        return []

    max_len = max(word_len(txt1) for txt1 in terms)
    current = terms
    # Terms with fewer than three words have no middle word to remove.
    for len1 in range(max_len, 2, -1):
        print("string length", len1)
        # Middle positions only: index 0 and index len1 - 1 are never removed.
        for pos in range(1, len1 - 1):
            print("pos", pos)
            candidates = [rem_one_word(txt1, len1, pos) for txt1 in current]
            counts = Counter(candidates)
            # Accept a candidate only when it is unique in this pass;
            # otherwise fall back to the term as it was before the pass.
            current = [
                candidate if counts[candidate] < 2 else previous
                for candidate, previous in zip(candidates, current)
            ]
    return current

terms1 = shorten_words(terms)

# Show every original term next to its shortened form.
for original, shortened in zip(terms, terms1):
    print(original, "   ", shortened)