import math
import pandas as pd
from collections import Counter
from typing import Dict, List, Any, Union

# Play Tennis test dataset: one tuple per observed day, values in column order.
_COLUMNS = ("Outlook", "Temperature", "Humidity", "Wind", "PlayTennis")
_ROWS = [
    ("Sunny", "Hot", "High", "Weak", "No"),
    ("Sunny", "Hot", "High", "Strong", "No"),
    ("Overcast", "Hot", "High", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Cool", "Normal", "Strong", "No"),
    ("Overcast", "Cool", "Normal", "Strong", "Yes"),
    ("Sunny", "Mild", "High", "Weak", "No"),
    ("Sunny", "Cool", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "Normal", "Weak", "Yes"),
    ("Sunny", "Mild", "Normal", "Strong", "Yes"),
    ("Overcast", "Mild", "High", "Strong", "Yes"),
    ("Overcast", "Hot", "Normal", "Weak", "Yes"),
    ("Rain", "Mild", "High", "Strong", "No"),
]

# Records as a list of dicts (column name -> value), then as a DataFrame.
data = [dict(zip(_COLUMNS, row)) for row in _ROWS]
df = pd.DataFrame(data)

# The class-label column, and every other column as a candidate attribute.
target = "PlayTennis"
attributes = [column for column in df.columns if column != target]

# Short summary of what was loaded.
print(f"Dataset size: {len(df)} samples")
print(f"Attributes: {attributes}")
print(f"Target: {target}")

def calculate_entropy(labels: List[str]) -> float:
    # Sample return value: 0.9709505944546686
    """
    Calculate the Shannon entropy (in bits) of a list of class labels.

    The entropy is -sum(p_i * log2(p_i)) over the distinct labels, where
    p_i is the fraction of entries carrying label i.

    Args:
        labels: List of class labels

    Returns:
        entropy: float - The entropy value; 0.0 when `labels` is empty or
        contains a single distinct label
    """
    total = len(labels)
    if total == 0:
        # No samples means no uncertainty; this also avoids dividing by zero.
        return 0.0

    entropy = 0.0
    # Counter tallies every class in one pass over the labels.
    for count in Counter(labels).values():
        proportion = count / total
        # Only labels that occur are visited, so proportion > 0 and
        # log2 is always defined here.
        entropy -= proportion * math.log2(proportion)

    return entropy

def calculate_information_gain(dataset: pd.DataFrame, attribute: str, target: str) -> float:
    # Sample return value: 0.2467498197744391
    """
    Calculate the information gain obtained by splitting on an attribute.

    The gain is the entropy of the target column over the whole dataset
    minus the size-weighted entropy of the target column within each
    subset that shares one value of `attribute`.

    Args:
        dataset: pd.DataFrame - the data to use in gain calculation
        attribute: The attribute to calculate gain for
        target: The target attribute (class label)

    Returns:
        gain: float - The information gain value; 0.0 for an empty dataset
    """
    total = len(dataset)
    if total == 0:
        # Nothing to split; this also avoids dividing by zero below.
        return 0.0

    # Start from the entropy of the unsplit data ...
    gain = calculate_entropy(dataset[target].tolist())

    # ... and subtract each partition's entropy, weighted by its share of rows.
    for value in dataset[attribute].unique().tolist():
        subset = dataset[dataset[attribute] == value]
        weight = len(subset) / total
        gain -= weight * calculate_entropy(subset[target].tolist())

    return gain

def find_best_attribute(dataset: pd.DataFrame, attributes: List[str], target: str) -> str:
    # Sample return value: "Outlook"
    """
    Find the attribute with the highest information gain.

    Attributes are evaluated in the order given; when several share the
    highest gain, the first of them is returned.

    Args:
        dataset: pd.DataFrame
        attributes: List of attribute names to consider
        target: The target attribute (class label)

    Returns:
        best_attribute: str - The name of the best attribute, or None when
        `attributes` is empty
    """
    best_attribute = None
    best_gain = -math.inf

    for attribute in attributes:
        gain = calculate_information_gain(dataset, attribute, target)
        # Strict comparison keeps the earliest attribute on ties.
        if gain > best_gain:
            best_attribute = attribute
            best_gain = gain

    return best_attribute

def build_tree(dataset: pd.DataFrame, attributes: List[str], target: str) -> Dict:
    # Sample return value:
    # {"attribute": "Outlook", "children": {
    #     "Sunny": {"attribute": "Humidity", "children": {"High": {"label": "No"}, "Normal": {"label": "Yes"}}},
    #     "Overcast": {"label": "Yes"},
    #     "Rain": {"attribute": "Wind", "children": {"Weak": {"label": "Yes"}, "Strong": {"label": "No"}}}
    # }}
    """
    Build a decision tree using the ID3 algorithm.

    A leaf is {"label": <class>}. An internal node is
    {"attribute": <name>, "children": {<value>: <subtree>, ...}} with one
    child per value of the chosen attribute that occurs in `dataset`.

    Args:
        dataset: pd.DataFrame
        attributes: List of available attribute names
        target: The target attribute (class label)

    Returns:
        tree: Dict - A nested dictionary representing the decision tree;
        an empty dict when `dataset` has no rows
    """
    labels = dataset[target].tolist()
    if not labels:
        # No samples: there is no label to predict.
        return {}

    label_counts = Counter(labels)
    majority_label = label_counts.most_common(1)[0][0]

    # Stop when the node is pure or there is nothing left to split on;
    # in the latter case fall back to the most frequent label.
    if len(label_counts) == 1 or not attributes:
        return {"label": majority_label}

    best_attribute = find_best_attribute(dataset, attributes, target)
    if best_attribute is None:
        # No usable split was found; fall back to the most frequent label.
        return {"label": majority_label}

    # An attribute is used at most once along any root-to-leaf path.
    remaining = [name for name in attributes if name != best_attribute]

    children = {}
    for value in dataset[best_attribute].unique().tolist():
        subset = dataset[dataset[best_attribute] == value]
        if subset.empty:
            # Values that match no row (e.g. NaN, which never compares
            # equal to itself) would only yield an empty subtree.
            continue
        children[value] = build_tree(subset, remaining, target)

    return {"attribute": best_attribute, "children": children}

# Part 1 check: entropy of a 5-vs-4 label split.
test_labels = ["Yes"] * 5 + ["No"] * 4
entropy_result = calculate_entropy(test_labels)
print(f"Part 1 - Entropy of {test_labels}:")
print(f"Your result: {entropy_result:.4f}")
print("Expected: ~0.9911", end="\n\n")

# Part 2 check: information gain of "Outlook" on the full dataset.
gain_outlook = calculate_information_gain(df, "Outlook", target)
print(f"Part 2 - Information Gain for 'Outlook': {gain_outlook:.4f}")
print("Expected: ~0.2467", end="\n\n")

# Part 3 check: attribute selected for the first split.
best = find_best_attribute(df, attributes, target)
print(f"Part 3 - Best attribute: {best}")
print("Expected: Outlook", end="\n\n")

# Part 4 check: the complete tree, printed as a nested dict.
tree = build_tree(df, attributes, target)
print("Part 4 - Decision Tree:", tree, sep="\n", end="\n\n")

# ID3 Decision Tree Algorithm - Student Assignment

## ID3 Pseudocode

## Setup - Required Imports

## Load Dataset

## Part 1: Calculate Entropy (2 points)

## Part 2: Calculate Information Gain (2 points)

## Part 3: Find Best Attribute (2 points)

## Part 4: Build Decision Tree - ID3 Algorithm (2 points)

## Testing