The output of this Python code is not what I am expecting

110 Views Asked by At

I'm a new CE student and I wanted to make a Python program that reads input from a text file and writes output to a text file. In this program I want it to take the input line by line and give me the frequency of its contents, but it's not accurate and has messy output. I was hoping you could help me.

this is the output it gives me :

 'Kidney Beans': 5
 'Onion': 4
: 4
['Milk': 3
 'Yogurt']: 3
 'Kidney Beans'  'Yogurt']: 3
  'Yogurt']: 3

this is the output I want :

Kidney Beans: 5
Onion: 4
Eggs: 4
Yogurt: 3
Yogurt, Kidney Beans: 3
Milk: 3
Kidney Beans, Milk: 3

this is my python code :


class TreeNode:
    """A single node of an FP-tree.

    Attributes:
        name: the item this node represents.
        frequency: how many transactions pass through this node.
        parent: the parent TreeNode (None for the root).
        link: next TreeNode carrying the same item name (header-table chain).
        children: mapping of child item name -> child TreeNode.
    """

    def __init__(self, name, frequency, parent):
        self.name = name
        self.parent = parent
        self.frequency = frequency
        # Filled in later: node-link chain and child nodes.
        self.link = None
        self.children = {}

    def increment(self, frequency):
        """Add *frequency* to this node's occurrence count."""
        self.frequency = self.frequency + frequency

# Update the tree with filtered transactions
def update_tree(items, node, header_table):
    """Insert one ordered transaction (*items*) into the tree below *node*.

    Existing children have their counts incremented; missing children are
    created with count 1 and threaded onto the header table's node-link
    chain for their item.
    """
    current = node
    for item in items:
        child = current.children.get(item)
        if child is not None:
            # Path already exists: just bump the count.
            child.increment(1)
        else:
            child = TreeNode(item, 1, current)
            current.children[item] = child
            # Thread the new node onto the chain of same-named nodes.
            head = header_table[item][1]
            if not head:
                header_table[item][1] = child
            else:
                update_header(child, head)
        current = child

# Update the header table to link similar items
def update_header(node_to_test, target_node):
    """Append *node_to_test* to the end of the node-link chain that
    starts at *target_node* (nodes sharing the same item name)."""
    tail = target_node
    while tail.link is not None:
        tail = tail.link
    tail.link = node_to_test

# Find frequent itemsets
def mine_tree(header_table, min_support, prefix, freq_items):
    """Recursively mine frequent itemsets from an FP-tree's header table.

    Each discovered itemset is appended to *freq_items* as
    (itemset, support). Items are processed from least to most frequent;
    keys are unique, so sorting descending directly is identical to the
    ascending-sort-then-reverse formulation.
    """
    ordered = sorted(
        header_table.items(),
        key=lambda entry: (entry[1][0], entry[0]),
        reverse=True,
    )
    for item, (support, node_link) in ordered:
        itemset = prefix.copy()
        itemset.add(item)
        freq_items.append((itemset, support))

        # Build this item's conditional pattern bases and conditional tree.
        conditional_bases = find_prefix_path(item, node_link)
        _cond_tree, cond_header = create_tree(conditional_bases, min_support)

        if cond_header is not None:
            mine_tree(cond_header, min_support, itemset, freq_items)

# Ascend tree
def ascend_tree(node, prefix_path):
    """Append the item names on the path from *node* up to (but not
    including) the root onto *prefix_path*, nearest node first."""
    current = node
    while current.parent is not None:
        prefix_path.append(current.name)
        current = current.parent

# Find prefix path
def find_prefix_path(base_pat, treeNode):
    """Collect the conditional pattern bases of *base_pat*.

    Walks the node-link chain starting at *treeNode*; for each occurrence,
    the path above it (excluding the node itself) becomes a pattern base
    keyed by frozenset, mapped to that occurrence's frequency.
    """
    cond_pats = {}
    node = treeNode
    while node is not None:
        path = []
        ascend_tree(node, path)
        # path[0] is base_pat itself; only the ancestors form the base.
        if len(path) > 1:
            cond_pats[frozenset(path[1:])] = node.frequency
        node = node.link
    return cond_pats

# Create the FP-growth tree
def create_tree(transactions, min_support):
    """Build an FP-tree from *transactions*.

    Returns (root, header_table) where header_table maps each frequent
    item to [support, head-of-node-link-chain], or (None, None) when no
    item meets *min_support*.

    NOTE(review): when called from mine_tree, *transactions* is a dict of
    conditional pattern bases; iterating it yields only the frozenset keys,
    so each base contributes 1 regardless of its stored count — confirm
    this is acceptable for the datasets used.
    """
    # First pass: count the support of every item.
    header_table = {}
    for transaction in transactions:
        for item in transaction:
            header_table[item] = header_table.get(item, 0) + 1

    # Remove items not meeting minimum support
    for k in list(header_table):
        if header_table[k] < min_support:
            del(header_table[k])

    freq_item_set = set(header_table.keys())
    if len(freq_item_set) == 0:
        return None, None

    # Initialize header table: entry becomes [support, node-link head].
    for k in header_table:
        header_table[k] = [header_table[k], None]

    tree_root = TreeNode('Null Set', 1, None)
    # Second pass: insert each transaction, most frequent items first.
    for transaction in transactions:
        transaction_filtered = [item for item in transaction if item in freq_item_set]
        # Tie-break equal supports by item name so the ordering is
        # consistent across transactions (and agrees with mine_tree's
        # (support, name) ordering); sorting by support alone lets
        # equally-frequent items take different orders in different
        # transactions, fragmenting shared prefixes in the tree.
        transaction_filtered.sort(key=lambda item: (header_table[item][0], item), reverse=True)
        if transaction_filtered:
            update_tree(transaction_filtered, tree_root, header_table)
    return tree_root, header_table

# Load data from file
def load_data(file_path):
    """Read transactions from *file_path*: one comma-separated transaction
    per line. Returns a list of lists of item strings.
    """
    dataset = []
    # Bug fix: the original opened the hardcoded 'InputData.txt' and
    # ignored the file_path argument entirely.
    with open(file_path, 'r') as file:
        for line in file:  # iterate lazily; no need for readlines()
            transaction = line.strip().split(',')  # Adjust delimiter if necessary
            dataset.append(transaction)
    return dataset

# Main function to run FP-growth algorithm
def fpgrowth():
    """Run FP-growth end to end: load transactions, prompt for the minimum
    support, mine frequent itemsets, and write them to a text file."""
    file_path = "InputData.txt"  # Specify your dataset file name
    transactions = load_data(file_path)
    min_support = int(input("Please enter the minimum support: "))

    # Build the FP-growth tree
    tree, header_table = create_tree(transactions, min_support)

    # Find frequent itemsets
    freq_items = []
    if tree is not None:
        mine_tree(header_table, min_support, set(), freq_items)

    # Write the frequent itemsets to the output file, highest support
    # first. Each itemset is a Python set (unordered), so sort its items
    # for deterministic output, and join with ", " to match the desired
    # "Yogurt, Kidney Beans: 3" format (the original space-join produced
    # arbitrary ordering).
    output_file_name = "frequent_itemsets.txt"
    with open(output_file_name, 'w') as f:
        for itemset, support in sorted(freq_items, key=lambda i: i[1], reverse=True):
            f.write(f"{', '.join(sorted(itemset))}: {support}\n")
    print(f"Frequent itemsets written to {output_file_name}")

# Run the FP-growth algorithm only when executed as a script, so that
# importing this module (e.g. for testing) causes no side effects.
if __name__ == "__main__":
    fpgrowth()

and this is my Database :

dataset = [ ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'], ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'], ['Milk', 'Apple', 'Kidney Beans', 'Eggs'], ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'], ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'] ]

I tried many things, such as ChatGPT and asking my colleagues, but it's all the same.

2

There are 2 best solutions below

8
John Gordon On

It seems your load_data() function is expecting a different file format than what you actually have.

Try replacing the input file with these contents:

Milk,Onion,Nutmeg,Kidney Beans,Eggs,Yogurt
Dill,Onion,Nutmeg,Kidney Beans,Eggs,Yogurt
Milk,Apple,Kidney Beans,Eggs
Milk,Unicorn,Corn,Kidney Beans,Yogurt
Corn,Onion,Onion,Kidney Beans,Ice cream,Eggs
4
Mark Tolonen On

As noted in the comments, any library can be used. Here's an implementation using ast.literal_eval to parse the Python syntax in the input:

import ast

# Load data from file
def load_data(file_path):
    with open(file_path) as f:
        return ast.literal_eval(f.read().split('=')[1])

Replacing only the load_data function in the OP's code gives:

Please enter the minimum support: 3
Frequent itemsets written to frequent_itemsets.txt

frequent_itemsets.txt:

Kidney Beans: 5
Onion: 4
Eggs: 4
Yogurt: 3
Kidney Beans Yogurt: 3
Milk: 3
Kidney Beans Milk: 3