"""Convert Facebook JSON data to plain format for data analysis. Download data from Facebook as JSON file and unzip to local folder, like "C:/temp/facebook-me". Now you can get your friends list with timestamps and your phone numbers stored by Facebook as well as posts. The easiest way to access data is get_something() functions: from friends import get_friends, get_address_book, get_posts directory = "C:/temp/facebook-me" # your path to folder with JSON here friends = get_friends(directory) phones = get_address_book(directory) posts = get_posts(directory) The functions about return lists. You may want to work with generators of tuples or dicts or pandas dataframes as well. You can use reader classes for that: from friends import Friends, Comments, Posts, AddressBook f = Friends("./facebook-epogrebnyak") friends = f.get_tuples() # list of (timestamp, name) tuples friends_dicts = f.get_dicts() # same data as list of dictionaries friends_gen = f.iterate() # useful for streaming large archives friends_df = f.get_dataframe() # pandas DataFrame ready for analysis f.save_csv("./output_folder") # saves data to 'friends.csv' """ import datetime import json from dataclasses import dataclass from pathlib import Path from typing import Callable, List __all__ = [ "save_csv_all" "get_friends", "Friends", "get_address_book", "AddressBook", "get_posts", "Posts", "get_comments", "Comments", "get_reactions", "Reactions", "get_sessions", "Sessions", ] @dataclass class Getter: name: str # file location inside JSON folder path: List[str] # access specific part of file after reading it by read_json() unpack: Callable # convert one element (eg post, comment) to final representation elem: Callable columns: List[str] def make_path(self, directory): return Path(directory).joinpath(*self.path) def iterate(self, directory): path = self.make_path(directory) xs = read_json(path) for x in self.unpack(xs): yield self.elem(x) class FB: """Getter classes by types of content.""" address_book = Getter( name="address_book", path=["about_you", "your_address_books.json"], unpack=lambda xs: xs["address_book"]["address_book"], elem=lambda x: (decode(x["name"]), extract_address_book_details(x)), columns=["name", "contact"], ) friends = Getter( name="friends", path=["friends", "friends.json"], unpack=lambda xs: xs["friends"], elem=lambda x: (x["timestamp"], decode(x["name"]),), columns=["timestamp", "name"], ) posts = Getter( name="posts", # maybe there are several files for posts path=["posts", "your_posts_1.json"], unpack=lambda xs: xs, elem=lambda x: (x["timestamp"], extract_post(x),), columns=["timestamp", "content"], ) comments = Getter( name="comments", path=["comments", "comments.json"], unpack=lambda xs: xs["comments"], elem=lambda x: (x["timestamp"], decode(x["data"][0]["comment"]["comment"])), columns=["timestamp", "content"], ) reactions: Getter = Getter( name="reactions", path=["likes_and_reactions", "posts_and_comments.json"], unpack=lambda xs: xs["reactions"], elem=lambda x: ( x["timestamp"], x["data"][0]["reaction"]["reaction"], decode(x["title"]), ), columns=["timestamp", "reaction", "title"], ) sessions: Getter = Getter( name="sessions", path=["security_and_login_information", "account_activity.json"], unpack=lambda xs: xs["account_activity"], elem=lambda x: ( x["timestamp"], x["ip_address"], decode(x["city"]), decode(x["region"]), x["country"], ), columns=["timestamp", "ip_address", "city", "region", "ip_address"], ) class Reader: """Parent class to associate specific *directory* and Getter.""" getter: Getter = None def __init__(self, directory: str): self.directory = directory @property def columns(self): return self.getter.columns def iterate(self): return self.getter.iterate(self.directory) def get_tuples(self): return list(self.iterate()) def yield_dicts(self): for values in self.iterate(): yield dict(zip(self.columns, values)) def get_dicts(self): return list(self.yield_dicts()) def get_dataframe(self): import pandas as pd # type: ignore df = pd.DataFrame(self.iterate(), columns=self.columns) if "timestamp" in self.columns: df["timestamp"] = df.timestamp.map(lambda x: pd.Timestamp(x, unit="s")) return df @classmethod def csv_path(cls, output_dir): return Path(output_dir) / (cls.getter.name + ".csv") def save_csv(self, output_dir): df = self.get_dataframe() filepath = self.csv_path(output_dir) df.to_csv(filepath, index=None) return filepath def save_csv_all(source_dir: str, output_dir: str): filepaths = [] for reader in Reader.__subclasses__(): fp = reader(source_dir).save_csv(output_dir) filepaths.append(fp) print("\nSaved files:\n ", "\n ".join(map(str, filepaths))) return filepaths class Friends(Reader): getter = FB.friends class Comments(Reader): getter = FB.comments class Posts(Reader): getter = FB.posts class AddressBook(Reader): getter = FB.address_book class Reactions(Reader): getter = FB.reactions class Sessions(Reader): getter = FB.sessions def all_getters(): return [getattr(FB, k) for k in FB.__dict__.keys() if not k.startswith("_")] def get_address_book(directory: str): return AddressBook(directory).get_tuples() def get_friends(directory: str): return Friends(directory).get_tuples() def get_posts(directory: str): return Posts(directory).get_tuples() def get_comments(directory: str): return Comments(directory).get_tuples() def get_reactions(directory: str): return Reactions(directory).get_tuples() def get_sessions(directory: str): return Sessions(directory).get_tuples() def read_json(filename: Path): with open(filename) as f: return json.load(f) # not in use now def extract_timestamp(x: int) -> datetime.datetime: """Convert seconds to timestamp.""" return datetime.datetime.fromtimestamp(x) def decode(string: str) -> str: """Return *string* in readable view. Facebook encodes utf as latin-1 making non-latin chars unreadable. See: https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded """ return string.encode("latin-1").decode("utf-8") def extract_address_book_details(d: dict) -> str: try: return d["details"][0]["contact_point"] except IndexError: return "" def extract_post(d: dict) -> str: try: return decode(d["data"][0]["post"]) except KeyError: return "" def tprint(labels, values, **kwargs): # See https://github.com/mkaz/termgraph/issues/27 from termgraph.termgraph import chart # type: ignore args = { "stacked": False, "width": 50, "no_labels": False, "format": "{:<5.2f}", "suffix": "", "vertical": False, "histogram": False, "no_values": False, } args.update(kwargs) data = [[x] for x in values] chart(colors=[], data=data, args=args, labels=labels) if __name__ == "__main__": import pandas as pd # type: ignore def count(df_): df = df_.set_index("timestamp").groupby(pd.Grouper(freq="M")).count() df.index = df.index.to_period("M") return df def print_count(df_): df = count(df_) tprint( [str(x) for x in df.index], df.iloc[:, 0].tolist(), format="{:<5.0f}", width=20, ) directory = "./facebook-epogrebnyak" phones = get_address_book(directory) print("\nContacts from my phonebook stored by Facebook:") tprint(["Total"], [len(phones)], format="{:d}") friends = get_friends(directory) print("Friends added by month (total %i)" % len(friends)) friends_df = Friends(directory).get_dataframe() print_count(friends_df) comments_df = Comments(directory).get_dataframe() posts_df = Posts(directory).get_dataframe() pubs_df = pd.concat([posts_df, comments_df]) print("\nNumber of posts and comments by month (total %i)" % len(pubs_df)) print_count(pubs_df) reactions_df = Reactions(directory).get_dataframe() print("\nReactions by month (total %i)" % len(reactions_df)) print_count(reactions_df) sessions_df = Sessions(directory).get_dataframe() print("\nSessions by month (total %i)" % len(sessions_df)) print_count(sessions_df) print( "Session locations (includes VPN):", ", ".join(sorted(sessions_df.city.unique().tolist())), ) f = Friends(directory) friends_list = f.get_tuples() # returns list of tuples friends_dicts = f.get_dicts() # returns list of dictionaries friends_gen = f.iterate() # useful for streaming large archives friends_df = f.get_dataframe() # ready for analysis f.save_csv("./output_folder") filepaths = save_csv_all( source_dir="./facebook-epogrebnyak", output_dir="./output_folder" ) # TODO - things to try: # Implementation: # - Enforce dataframe properties via pandera or bulwark # - Generate fake data and folder stucture for testing # - Installable package (via poetry?) # - Rename package and project # - Test on large archive (~1GB) # - Logging strategy # Functionality: # - [x] text-based graphs # - [x] output directory for CSVs # - save all files as CSV # - largest files in the directory - see `tree --sort=size -s .` # - own jpegs in posts # Content: # - all links ever mentioned in posts # - FB Messenger messages # Probably cannot do that: # - all posts where I'm tagged # - face recognition data (useless)