# Locations of the two input text files and of the output written by Exercise 2.1.
users_path = "../data/Users.txt"
checkins_sna_path = "../data/Checkins_SNA.txt"
output_part1 = "./output_part1"

# Exercise 2.1
# Load both inputs as RDDs of raw text lines.
# NOTE(review): `sc` is not defined in this part of the file; presumably it is
# an already-created SparkContext -- confirm against the launcher/notebook.
users_rdd = sc.textFile(users_path)
checkins_sna_rdd = sc.textFile(checkins_sna_path)

def CodU_City_Country(line):
    """Parse one comma-separated user line into ``(CodU, (City, Country))``.

    CodU is taken from field 0, City from field 3 and Country from field 4.
    Raises IndexError when the line has fewer than five fields.
    """
    columns = line.split(",")
    user_id = columns[0]
    location = (columns[3], columns[4])
    return (user_id, location)
# Parse every user line into (CodU, (City, Country)), then keep only the
# records whose Country component is exactly "Italy".
users_info_rdd = users_rdd.map(CodU_City_Country)
italian_users_rdd = users_info_rdd.filter(
    lambda user: user[1][1] == "Italy"
)

def CodU_City(user):
    """Project a ``(CodU, (City, ...))`` record down to ``(CodU, City)``."""
    return (user[0], user[1][0])
# Drop the country from the Italian-user records, leaving (CodU, City) pairs.
italian_users_city_rdd = italian_users_rdd.map(CodU_City)

def CodU_Year(line):
    """Parse one comma-separated check-in line into ``(CodU, Year)``.

    CodU is field 0; Year is whatever precedes the first "/" in field 1
    (the timestamp), or the whole field when it contains no "/".
    Raises IndexError when the line has fewer than two fields.
    """
    columns = line.split(",")
    user_id, timestamp = columns[0], columns[1]
    return (user_id, timestamp.split("/")[0])
# Reduce each SNA check-in line to (CodU, Year) and keep those whose year
# component equals "2024".
checkins_year_rdd = checkins_sna_rdd.map(CodU_Year)
checkins_2024_rdd = checkins_year_rdd.filter(lambda pair: pair[1] == "2024")
# Emit one (CodU, 1) per remaining check-in so they can be summed per user.
checkins_2024_count_rdd = checkins_2024_rdd.map(lambda pair: (pair[0], 1))

# Sum the ones per user, then inner-join with the Italian (CodU, City) pairs;
# each joined record is (CodU, (numCheckins, City)).
checkins_per_user_rdd = checkins_2024_count_rdd.reduceByKey(
    lambda left, right: left + right
)
italian_users_checkins_rdd = checkins_per_user_rdd.join(italian_users_city_rdd)

def City_NumCheckins(record):
    """Re-key a ``(CodU, (numCheckins, City))`` record by its check-in count.

    Returns ``(numCheckins, (CodU, City))`` so the count can be used as the
    ordering key.
    """
    user_id = record[0]
    joined = record[1]
    return (joined[0], (user_id, joined[1]))
# Re-key each joined record by its check-in count: (numCheckins, (CodU, City)).
checkins_city_rdd = italian_users_checkins_rdd.map(City_NumCheckins)
# top() selects the 10 largest records without globally sorting the whole RDD,
# which sortByKey(...).take(10) required just to read 10 rows.
# Ranking uses the count alone (record[0]), exactly as sortByKey did.
# NOTE: despite its name, top10_rdd is a plain Python list (ordered by
# descending count), not an RDD.
top10_rdd = checkins_city_rdd.top(10, key=lambda record: record[0])
# Turn the list back into an RDD so it can be mapped and saved as text below.
top10_final_rdd = sc.parallelize(top10_rdd)

def writeOutput(record):
    """Format a ``(numCheckins, (CodU, City))`` record as the line ``CodU,City``.

    The check-in count in position 0 is not written.
    """
    user_and_city = record[1]
    return ",".join((user_and_city[0], user_and_city[1]))
# Format each selected record as a "CodU,City" text line and save the result
# under the Exercise 2.1 output path.
output_part1_rdd = top10_final_rdd.map(writeOutput)
output_part1_rdd.saveAsTextFile(output_part1)



# Exercise 2.2
# Additional input file and the output location used by this exercise.
checkins_snb_path = "../data/Checkins_SNB.txt"
output_part2 = "./output_part2"

# Load the SNB check-ins as an RDD of raw text lines.
checkins_snb_rdd = sc.textFile(checkins_snb_path)

def CodU_Country(line):
    """Parse one comma-separated user line into ``(CodU, Country)``.

    CodU is field 0 and Country is field 4. Raises IndexError when the line
    has fewer than five fields.
    """
    columns = line.split(",")
    return (columns[0], columns[4])
# Parse users into (CodU, Country) and keep those whose Country is "Italy".
# NOTE: this rebinds italian_users_rdd, which Exercise 2.1 used for
# (CodU, (City, Country)) records; from here on it holds (CodU, Country).
users_country_rdd = users_rdd.map(CodU_Country)
italian_users_rdd = users_country_rdd.filter(
    lambda pair: pair[1] == "Italy"
)
# Replace the country with a placeholder 1 so the RDD is keyed by CodU only.
italian_users_key_rdd = italian_users_rdd.map(lambda pair: (pair[0], 1))

def CodU_POIID(line):
    """Parse one comma-separated check-in line into ``((CodU, POIID), 1)``.

    CodU is field 0 and POIID is field 2; the trailing 1 is a unit count
    meant to be summed per (CodU, POIID) key. Raises IndexError when the
    line has fewer than three fields.
    """
    columns = line.split(",")
    user_poi = (columns[0], columns[2])
    return (user_poi, 1)
# Count SNA check-ins per (CodU, POIID) pair and keep the pairs with at
# least 1000 check-ins.
checkins_sna_user_poi_rdd = checkins_sna_rdd.map(CodU_POIID)
user_poi_checkins_rdd = checkins_sna_user_poi_rdd.reduceByKey(
    lambda left, right: left + right
)
superstar_user_poi_rdd = user_poi_checkins_rdd.filter(
    lambda entry: entry[1] >= 1000
)

def CodU_One_POI(record):
    """Turn a ``((CodU, POIID), count)`` record into ``(CodU, 1)``.

    The POIID and the count are discarded; the 1 stands for one POI so that
    POIs can be summed per user.
    """
    return (record[0][0], 1)
# Count, per user, how many (CodU, POIID) pairs survived the 1000-check-in
# filter, and keep the users with at least 20 such POIs.
superstar_poi_per_user_rdd = superstar_user_poi_rdd.map(CodU_One_POI)
num_superstar_pois_rdd = superstar_poi_per_user_rdd.reduceByKey(
    lambda left, right: left + right
)
eligible_superstars_rdd = num_superstar_pois_rdd.filter(
    lambda entry: entry[1] >= 20
)

def CodU_One_SNB(line):
    """Parse one comma-separated check-in line into ``(CodU, 1)``.

    CodU is everything before the first comma (the whole line when there is
    no comma); the 1 is a unit count meant to be summed per user.
    """
    user_id = line.partition(",")[0]
    return (user_id, 1)
# Count SNB check-ins per user.
checkins_snb_count_rdd = checkins_snb_rdd.map(CodU_One_SNB)
checkins_snb_per_user_rdd = checkins_snb_count_rdd.reduceByKey(
    lambda left, right: left + right
)
# Left outer join from the Italian users so that users with no SNB check-ins
# are kept; their joined count is None: (CodU, (1, count_or_None)).
italian_users_snb_rdd = italian_users_key_rdd.leftOuterJoin(
    checkins_snb_per_user_rdd
)

def CodU_SNBCount(record):
    """Turn a left-outer-join record ``(CodU, (_, count))`` into ``(CodU, count)``.

    A count of None (no matching right-hand record) is reported as 0.
    """
    user_id = record[0]
    joined_count = record[1][1]
    return (user_id, 0 if joined_count is None else joined_count)
# Replace missing SNB counts with 0 and keep the Italian users with fewer
# than 10 SNB check-ins.
snb_count_complete_rdd = italian_users_snb_rdd.map(CodU_SNBCount)
eligible_snb_users_rdd = snb_count_complete_rdd.filter(
    lambda entry: entry[1] < 10
)

# Inner join keeps only users present in both sets; each result is
# (CodU, (numPOIs, snb_count)).
final_users_rdd = eligible_superstars_rdd.join(eligible_snb_users_rdd)

def writeOutput(record):
    """Format a ``(CodU, (numPOIs, ...))`` record as the line ``CodU,numPOIs``.

    numPOIs is converted with ``str``; any further joined values are ignored.
    """
    poi_count = record[1][0]
    return ",".join((record[0], str(poi_count)))
# Format each selected user as a "CodU,numPOIs" text line and save the result
# under the Exercise 2.2 output path.
output_part2_rdd = final_users_rdd.map(writeOutput)
output_part2_rdd.saveAsTextFile(output_part2)

