# Summarise index details

This notebook counts the number of rows in each index and calculates the total for the whole repository. It formats the results in nice HTML and Markdown tables for easy browsing.

In [2]:
from urllib.parse import urljoin

import pandas as pd
from IPython.display import HTML, display
from tabulate import tabulate

## Add links and totals to the list of indexes

In [10]:
# Read the list of indexes from the CSV file, then order it by title
df = pd.read_csv("indexes.csv")
df = df.sort_values(by="title")

In [11]:
def make_download_link(url):
    """
    Create an HTML link to download an index's CSV file from GitHub.

    The CSV file name is the last path segment of `url` (leading and
    trailing slashes are ignored) with ".csv" appended.

    Parameters:
        url: address of the index's web page.

    Returns:
        A string containing an HTML anchor labelled "CSV file" whose
        href is the address of the CSV file in the GitHub repository.
    """
    slug = url.strip("/").split("/")[-1]
    filename = f"{slug}.csv"
    csv_url = urljoin(
        "https://media.githubusercontent.com/media/wragge/srnsw-indexes/master/data/",
        filename,
    )
    # The address must be interpolated into the anchor's href; a template
    # without a placeholder would return only the plain label text.
    return '<a href="{}">CSV file</a>'.format(csv_url)


# Create an HTML link to the index data on the NSWSA site.
# The address is placed in the anchor's href so the table cell is clickable
# rather than containing only the text "Browse index".
df["web"] = df["url"].apply(lambda x: '<a href="{}">Browse index</a>'.format(x))

# Create an HTML link to download the CSV file from GitHub
df["download"] = df["url"].apply(make_download_link)

In [12]:
def count_rows(url):
    """
    Return the number of rows in the CSV file belonging to an index.

    The CSV file name is the last path segment of `url` (leading and
    trailing slashes are ignored) with ".csv" appended; it is loaded
    from the repository's data directory on GitHub.
    """
    data_dir = (
        "https://media.githubusercontent.com/media/wragge/srnsw-indexes/master/data/"
    )
    index_name = url.strip("/").rsplit("/", 1)[-1]
    csv_url = urljoin(data_dir, index_name + ".csv")
    # Load every column with dtype=object; only the row count is used
    table = pd.read_csv(csv_url, dtype=object)
    row_count, _ = table.shape
    return row_count


# Record how many rows each index's CSV file contains
df["rows"] = df["url"].apply(count_rows)

In [13]:
# Total number of rows across every index in the repository
df.loc[:, "rows"].sum()

2481881

In [14]:
# Select the columns that we want
columns = df[["title", "rows", "download", "web"]]

# Create a list of headers
headers = ["Title", "Number of rows", "Download data", "View at State Archives"]

# Use Tabulate to generate an HTML table.
# The "unsafehtml" format is used so the anchor tags in the link columns
# are emitted as markup; intfmt="," adds thousands separators to row counts.
display(
    HTML(
        tabulate(
            columns, headers=headers, showindex=False, tablefmt="unsafehtml", intfmt=","
        )
    )
)

# Write a GitHub Markdown formatted version of the table to a file.
# The encoding is given explicitly so the output does not depend on the
# platform's default encoding when titles contain non-ASCII characters.
with open("indexes.md", "w", encoding="utf-8") as md_file:
    md_file.write(
        tabulate(
            columns, headers=headers, showindex=False, tablefmt="github", intfmt=","
        )
    )



----

Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/) project.