# Jupyter Documentation Stats Exploration (v2)
____
* Explore traffic and search stats for Jupyter subprojects
* Download combined (merged multi-CSV) traffic and search data
* Visit the **[Jupyter Docs Working Group](https://github.com/jupyter/docs-team-compass)**'s stats **[repo](https://github.com/ericsnekbytes/jupyter_docs_metrics)** (or **[dashboard](https://ericsnekbytes.github.io/jupyter_docs_metrics/)**) for more info/raw CSVs
  * If you're not using a clone of the repo, the ReadTheDocs CSV utility class is **[here](https://github.com/ericsnekbytes/jupyter_docs_metrics/blob/main/doc_metrics.py)**
  * You can drop the **[doc_metrics.py module (raw)](https://raw.githubusercontent.com/ericsnekbytes/jupyter_docs_metrics/main/doc_metrics.py)** next to this notebook so that it can be imported

In [1]:
import os
import re
import requests
import traceback
import urllib.parse

# Import the CSV wrapper utils (from the stats repo, more info above)

# # Uncomment this if you want to fetch the module from the URL directly
# # (the response is checked before the file is opened, so a failed
# # request does not leave an empty doc_metrics.py behind)
# DOC_METRICS_MODULE_URL = 'https://raw.githubusercontent.com/ericsnekbytes/jupyter_docs_metrics/main/doc_metrics.py'
# resp = requests.get(DOC_METRICS_MODULE_URL)
# if not resp.ok:
#     raise Exception('Could not fetch wrapper module!')
# with open(r'doc_metrics.py', 'wb') as fhandle:
#     fhandle.write(resp.content)

from doc_metrics import csv_to_rows_of_strings, RowColumnView, Metrics

In [2]:
# Local folder that receives the downloaded CSV files
output_folder = 'fetched_data'

# Template for the raw GitHub URL of a merged CSV: fill in {uname} and {fname},
# then append one of the suffixes below to get the traffic or search file
base_url = 'https://raw.githubusercontent.com/ericsnekbytes/jupyter_docs_metrics/main/metrics_output/{uname}/{fname}'
traffic_suffix, search_suffix = '_traffic.csv', '_search.csv'

# Note: some subprojects may have empty or missing traffic or search data.
# The raw input data for each subproject (showing what is and is not
# available) can be browsed here:
# https://github.com/ericsnekbytes/jupyter_docs_metrics/tree/main/subproject_csvs

In [3]:
# Subprojects to fetch, kept in sorted order. Each entry is the project's
# folder name in the GitHub subproject_csvs folder.
subproject_list = [
    'nbformat',
    'Enterprise Gateway',
    'ipywidgets',
    'Jupyter Client',
    'Jupyter Events',
    'Lumino',
    'JupyterHub',
    'Jupyter Notebook',
    'Lab Server',
    'nbconvert',
    'JupyterLab',
    'Terminado',
    'Jupyter Server',
    'Traitlets',
    'Jupyter',
    'ipykernel',
]
subproject_list.sort()


def _metrics_url(proj_name, suffix):
    """Return the download URL of one CSV (traffic or search) for ``proj_name``.

    The project name is percent-quoted for the folder part of the URL and has
    its spaces replaced by underscores for the file name part; ``suffix`` is
    appended to the result.
    """
    folder_part = urllib.parse.quote(proj_name)
    file_stem = proj_name.replace(' ', '_')
    return base_url.format(uname=folder_part, fname=file_stem) + suffix


# Per-project metadata: the two source URLs, plus slots for the local paths
# of the fetched files (None until a file has been fetched)
subproject_info = {
    proj_name: {
        'URLS': {
            'TRAFFIC': _metrics_url(proj_name, traffic_suffix),
            'SEARCH': _metrics_url(proj_name, search_suffix),
        },
        'TRAFFIC_PATH': None,
        'SEARCH_PATH': None,
    }
    for proj_name in subproject_list
}

In [4]:
# Make an output folder for fetched CSV files
os.makedirs(output_folder, exist_ok=True)
OVERWRITE_FILES = False  # Set to True to re-download and clobber files already on disk
REQUEST_TIMEOUT = 30  # Seconds to wait on each request before giving up

# Fetch and write CSVs to disk. Every file is handled on its own (best
# effort): a failed download is reported and leaves that project's path
# entry as None, without stopping the remaining downloads.
for pname, pinfo in subproject_info.items():
    # Pair each URL with the key its local path is stored under, so the
    # destination key never has to be guessed from the text of the URL
    for url_key, path_key in [('TRAFFIC', 'TRAFFIC_PATH'), ('SEARCH', 'SEARCH_PATH')]:
        pth = pinfo['URLS'].get(url_key)
        if pth is None:
            print(f'[E] Bad path: "{pth}"')
            continue

        fname = os.path.basename(pth)
        output_path = os.path.join(output_folder, fname)

        # A file from an earlier run is reused rather than treated as an
        # error: old data is still never clobbered, no request is wasted, and
        # the path gets recorded so later cells keep working on a re-run
        if not OVERWRITE_FILES and os.path.exists(output_path):
            pinfo[path_key] = output_path
            print(f'[SKIP] File "{fname}" already exists, reusing it. Change OVERWRITE_FILES to True to clobber any old data.')
            continue

        try:
            resp = requests.get(pth, timeout=REQUEST_TIMEOUT)
            if not resp.ok:
                print(f'[E] Error (Status {resp.status_code}) for "{fname}"')
                continue

            with open(output_path, 'wb') as fhandle:
                written = fhandle.write(resp.content)
            pinfo[path_key] = output_path
            print(f'[OK] File "{fname}" (Status {resp.status_code}) -> Bytes: {written}')
        except Exception:
            # Deliberately broad: report the failure and move on to the next file
            print(f'{traceback.format_exc()}\n\n[E] Error fetching data, read more above...')

[OK] File "Enterprise_Gateway_traffic.csv" (Status 200) -> Bytes: 255600
[OK] File "Enterprise_Gateway_search.csv" (Status 200) -> Bytes: 7105
[OK] File "Jupyter_traffic.csv" (Status 200) -> Bytes: 438097
[OK] File "Jupyter_search.csv" (Status 200) -> Bytes: 191097
[OK] File "Jupyter_Client_traffic.csv" (Status 200) -> Bytes: 147791
[OK] File "Jupyter_Client_search.csv" (Status 200) -> Bytes: 9770
[OK] File "Jupyter_Events_traffic.csv" (Status 200) -> Bytes: 15773
[E] Error (Status 404) for "Jupyter_Events_search.csv"
[OK] File "Jupyter_Notebook_traffic.csv" (Status 200) -> Bytes: 1473820
[OK] File "Jupyter_Notebook_search.csv" (Status 200) -> Bytes: 245700
[OK] File "Jupyter_Server_traffic.csv" (Status 200) -> Bytes: 302907
[OK] File "Jupyter_Server_search.csv" (Status 200) -> Bytes: 31828
[OK] File "JupyterHub_traffic.csv" (Status 200) -> Bytes: 2360259
[OK] File "JupyterHub_search.csv" (Status 200) -> Bytes: 90093
[OK] File "JupyterLab_traffic.csv" (Status 200) -> Bytes: 2469896
[OK

In [5]:
# Use JupyterLab's traffic data as a worked example: look up the local CSV
# path recorded for it and build a Metrics object from that file to explore
lab_info = subproject_info['JupyterLab']
lab_traffic_path = lab_info['TRAFFIC_PATH']
lab_traffic = Metrics.build(path=lab_traffic_path)

In [6]:
# Confirm that the metrics object is a RowColumnView (the simple 2d string
# list wrapper class), then print that class's docstring as a help message
is_row_column_view = isinstance(lab_traffic, RowColumnView)
print(f'Lab traffic is RowColumnView: {is_row_column_view}\n')

# Splitting on runs of newlines and re-joining collapses blank lines in the docstring
info = re.split(r'\n+', RowColumnView.__doc__)
docstring_text = '\n'.join(info)
print(f'Class {RowColumnView.__name__}\n\n' + docstring_text)

Lab traffic is RowColumnView: True

Class RowColumnView

Lightweight row index or column-name indexable lists of cell values.
    Headers are separated/removed from data rows.
    Supports:
        - for row in mydata:
              # Do something with the row
        - mydata.headers()
        - len(mydata)  # Only counts data rows (not headers)
        - Index on rows or columns:
              mydata[51]  # Row at index 51
              mydata['Date']  # Date column
        - Get cells from rows or columns by column name
              mydata[51][mydata.col_index('Date')]
              mydata['Date'][51]
        - "ColumnName" in mydata  # Check if sheet has header/column name
        - Get a copy of all rows/columns with rows(), columns()
        - Lazy load rows/columns with rowsi(), columni(), columnsi()
    


In [7]:
# Column names of the traffic data (headers are kept separate from the data rows)
lab_traffic.headers()

['Date', 'Version', 'Path', 'Views']

In [8]:
# Preview the first five data rows; rows() hands back a copy of the rows,
# each one a list of string cells in header order
lab_traffic.rows()[:5]

[['2024-04-10 00:00:00', 'latest', '/user/custom_css.html', '1'],
 ['2024-04-10 00:00:00', '4.0.x', '/user/extensions.html', '1'],
 ['2024-04-10 00:00:00', '3.1.x', '/getting_started/starting.html', '1'],
 ['2024-04-10 00:00:00', '3.4.x', '/getting_started/issue.html', '1'],
 ['2024-04-10 00:00:00', '3.3.x', '/user/code_console.html', '2']]

In [9]:
# Total view count for the JupyterLab docs
# (presumably the sum of the Views column -- confirm in doc_metrics.py)
lab_traffic.total_views()

1046223

In [10]:
# Five most viewed pages, shown as (path, views) pairs with the largest count first
lab_traffic.most_popular_pages(5)

[('/index.html', 183719),
 ('/getting_started/installation.html', 179258),
 ('/getting_started/starting.html', 95905),
 ('/getting_started/overview.html', 68575),
 ('/privacy_policies.html', 40326)]

In [11]:
# Five most viewed doc versions, shown as (version, views) pairs with the largest count first
lab_traffic.most_popular_versions(5)

[('stable', 643525),
 ('latest', 308051),
 ('3.6.x', 41259),
 ('1.2.x', 12183),
 ('3.5.x', 8281)]

In [12]:
# Rebuild the raw table -- the header row followed by every data row, all
# cells as strings -- so you can process it yourself
# (un-merged raw CSV data is also available in the repo)
string_rows = [lab_traffic.headers(), *lab_traffic.rows()]
string_rows[:10]

[['Date', 'Version', 'Path', 'Views'],
 ['2024-04-10 00:00:00', 'latest', '/user/custom_css.html', '1'],
 ['2024-04-10 00:00:00', '4.0.x', '/user/extensions.html', '1'],
 ['2024-04-10 00:00:00', '3.1.x', '/getting_started/starting.html', '1'],
 ['2024-04-10 00:00:00', '3.4.x', '/getting_started/issue.html', '1'],
 ['2024-04-10 00:00:00', '3.3.x', '/user/code_console.html', '2'],
 ['2024-04-10 00:00:00', '3.2.x', '/extension/virtualdom.html', '1'],
 ['2024-04-10 00:00:00', '1.2.x', '/developer/notebook.html', '1'],
 ['2024-04-10 00:00:00', '3.2.x', '/developer/patterns.html', '1'],
 ['2024-04-10 00:00:00', '2.2.x', '/user/file_formats.html', '1']]