# Locations of the three input text files.
tourists_path = "../data/Tourists.txt"
pois_path = "../data/POIs.txt"
visits_path = "../data/Visits.txt"

# Exercise 2.1

# Folder that receives the result of the first part.
output_part1 = "./output_part1"

# Load every input as an RDD of raw text lines (one element per line).
# textFile() is lazy, so the order of these three statements is irrelevant.
tourists_rdd = sc.textFile(tourists_path)
pois_rdd = sc.textFile(pois_path)
visits_rdd = sc.textFile(visits_path)

def POIID_Country(line):
    """Parse one comma-separated POI line into a ``(POIID, country)`` pair.

    The POI identifier is read from the first field and the country from
    the sixth field (index 5).

    Raises:
        IndexError: if the line has fewer than six comma-separated fields.
    """
    columns = line.split(",")
    return (columns[0], columns[5])
# Project every POI line to (POIID, country).
pois_country_rdd = pois_rdd.map(POIID_Country)

# Keep the POIs whose country is exactly "Italy", then pair each POI id with a
# constant 1 so the result can be joined against other pair RDDs keyed by POIID.
italian_pois_rdd = pois_country_rdd.filter(lambda poi: poi[1] == "Italy")
italian_pois_flag_rdd = italian_pois_rdd.keys().map(lambda poi_id: (poi_id, 1))


def POIID_CodT(line):
    """Parse one comma-separated visit line into a ``(POIID, CodT)`` pair.

    The tourist code is read from the first field and the POI identifier
    from the third field (index 2); the POI identifier becomes the key.

    Raises:
        IndexError: if the line has fewer than three comma-separated fields.
    """
    columns = line.split(",")
    tourist_id = columns[0]
    return (columns[2], tourist_id)
# Key every visit by its POI: (POIID, CodT).
visits_by_poi_rdd = visits_rdd.map(POIID_CodT)

# The inner join keeps only the visits whose POI appears among the Italian
# ones: (POIID, (CodT, 1)).
italian_visits_rdd = visits_by_poi_rdd.join(italian_pois_flag_rdd)
# One (CodT, 1) pair per Italian visit, summed per tourist: (CodT, num_visits).
visits_per_tourist_rdd = italian_visits_rdd.map(lambda x: (x[1][0], 1))
tourist_visits_count_rdd = visits_per_tourist_rdd.reduceByKey(lambda a, b: a + b)
# Cached because two separate jobs read it: the maximum computed below and
# the final save of the filtered result.
tourist_visits_count_rdd.cache()

# RDD.max() raises ValueError on an empty RDD, which would crash the job when
# no visit matches an Italian POI. fold() with a zero value of 0 yields the
# same maximum whenever data exists (every count is >= 1) and 0 otherwise, in
# which case the filter below selects nothing and an empty output is written.
max_visits = tourist_visits_count_rdd.values().fold(0, max)
top_tourists_rdd = tourist_visits_count_rdd.filter(lambda x: x[1] == max_visits)

# Only the tourist codes are stored, one per output line.
result_part1_rdd = top_tourists_rdd.keys()
result_part1_rdd.saveAsTextFile(output_part1)



# Exercise 2.2

# Folder that receives the result of the second part.
output_part2 = "./output_part2"

def POIID_Category_Country(line):
    """Parse one comma-separated POI line into ``(POIID, (category, country))``.

    The POI identifier is read from the first field, the category from the
    third field (index 2) and the country from the sixth field (index 5).

    Raises:
        IndexError: if the line has fewer than six comma-separated fields.
    """
    columns = line.split(",")
    details = (columns[2], columns[5])
    return (columns[0], details)
# Project every POI line to (POIID, (category, country)).
pois_info_rdd = pois_rdd.map(POIID_Category_Country)

# Keep the POIs whose country is exactly "Italy" and drop the country:
# (POIID, category).
# NOTE: italian_pois_rdd is rebound here; RDDs derived earlier from the
# previous binding keep referring to the old RDD object.
italian_pois_rdd = pois_info_rdd.filter(lambda poi: poi[1][1] == "Italy")
italian_pois_category_rdd = italian_pois_rdd.mapValues(lambda info: info[0])


def Visit_POIID_CodT_Year(line):
    """Parse one comma-separated visit line into ``(POIID, (CodT, year))``.

    The tourist code is read from the first field, the timestamp from the
    second and the POI identifier from the third. The year is whatever
    precedes the first ``/`` of the timestamp (the whole timestamp when it
    contains no ``/``), kept as a string.
    NOTE(review): this presumes timestamps begin with the year followed by
    ``/`` -- confirm against the Visits file format.

    Raises:
        IndexError: if the line has fewer than three comma-separated fields.
    """
    columns = line.split(",")
    tourist_id, when, poi_id = columns[0], columns[1], columns[2]
    visit_year = when.partition("/")[0]
    return (poi_id, (tourist_id, visit_year))
# Project every visit line to (POIID, (CodT, year)).
visits_info_rdd = visits_rdd.map(Visit_POIID_CodT_Year)

# Keep the visits whose year string equals "2024", then inner-join with the
# Italian POIs so only visits to those POIs remain:
# (POIID, ((CodT, year), category)).
visits_2024_rdd = visits_info_rdd.filter(lambda visit: visit[1][1] == "2024")
italian_visits_2024_rdd = visits_2024_rdd.join(italian_pois_category_rdd)

def CodT_Category(record):
    """Reduce a joined visit record to a ``(CodT, category)`` pair.

    ``record`` is indexed as ``(key, ((CodT, ...), category))``: the tourist
    code is the first item of the left join value and the category is the
    right join value.
    """
    joined = record[1]
    tourist_id = joined[0][0]
    return (tourist_id, joined[1])
# One (CodT, category) pair per distinct combination, so repeated visits of a
# tourist to the same category are counted once.
distinct_tourist_category_rdd = (
    italian_visits_2024_rdd
    .map(CodT_Category)
    .distinct()
)
# Number of distinct categories per tourist: (CodT, num_categories).
categories_per_tourist_rdd = (
    distinct_tourist_category_rdd
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda left, right: left + right)
)


def CodT_Zero(line):
    """Parse one comma-separated tourist line into a ``(CodT, 0)`` pair.

    The tourist code is the text before the first comma (the whole line when
    it contains no comma); the 0 is a constant placeholder value.
    """
    tourist_id = line.split(",", 1)[0]
    return (tourist_id, 0)
# Every tourist keyed by its code with a placeholder value: (CodT, 0).
all_tourists_rdd = tourists_rdd.map(CodT_Zero)
# Left outer join so that tourists without a category count are kept:
# (CodT, (0, num_categories)), with None in place of the count when missing.
final_result_rdd = all_tourists_rdd.leftOuterJoin(categories_per_tourist_rdd)

def writeOutput(record):
    """Format one joined record as the text line ``"CodT,numCategories"``.

    ``record`` is indexed as ``(CodT, (_, count))``. A ``count`` of ``None``
    is written as ``0``.
    """
    tourist_id, joined = record[0], record[1]
    count = joined[1]
    return tourist_id + "," + str(0 if count is None else count)
# Turn every joined record into its "CodT,numCategories" text line.
output_part2_rdd = final_result_rdd.map(writeOutput)

# Write the formatted lines to the output folder of the second part.
output_part2_rdd.saveAsTextFile(output_part2)
