# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""pagerank_deps computes the PageRank scores of a blended graph
of dependency and shared authorship. It is meant to be used
as a starting point for a more sophisticated analysis.

Install dependencies:
  # Python >= 3.9 < 3.13 should work
  pip install "networkx>=3.2.1" "numpy>=1.26.2" "pandas>=2.1.3" \
  "requests>=2.31.0" "scipy>=1.11.3" "tenacity>=8.2.3" "scikit-learn>=1.3.2"

Usage:

  python pagerank_deps.py -p packages.csv -c commits.csv

Packages include a list of deps.dev packages to analyze. Their transitive
dependencies will also be added into the graph for analysis. Note that
we "collapse" nodes that only differ by version to simplify the analysis,
e.g., k8s.io/kubernetes@v1.28.2 will be resolved to the same node as
k8s.io/kubernetes@v1.28.1.

An example of packages.csv:

```csv
name,system,version
k8s.io/kubernetes,go,v1.28.3
google.golang.org/protobuf,go,v1.31.0
```

Commits include a list of commits representing authorship data. It is a
list of commits, where each commit should map to a node in the graph.

An example of commits.csv:

```csv
dependency.name,dependency.system,dependency.version,author.email
k8s.io/kubernetes,go,v1.28.3,alice@example.com
google.golang.org/protobuf,go,v1.31.0,bob@example.com
github.com/google/go-cmp,go,v0.5.5,alice@example.com
```

A starting point for commits.csv can be obtained via:
```shell
# dependency{.name, .system, .version} needs to be supplied afterwards.
echo "hash,author.email" && git --no-pager log --pretty=format:'%h,%ae'
```

The flag `--input-dependency-graph` tells the script that the input
`--packages` is actually an edgelist that denotes the entire dependency graph,
which bypasses the deps.dev API.

An example of an [edgelist](https://networkx.org/documentation/stable/reference/readwrite/edgelist.html)
file:

```
NPM:A NPM:B
NPM:B NPM:C
NPM:B NPM:D
```
"""

from __future__ import annotations
import argparse
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from urllib.parse import quote
import networkx as nx
import numpy as np
import pandas as pd
import requests
from scipy.sparse import coo_array
from scipy.sparse import spmatrix
from sklearn.preprocessing import normalize
from tenacity import retry, stop_after_attempt, wait_random


class NameMap:
  """Bidirectional mapping between string names and dense integer indices.

  Indices are assigned in first-seen order, starting at 0.
  """

  names: list[str]
  indices: dict[str, int]

  def __init__(self):
    self.names = []
    self.indices = {}

  def index(self, name: str) -> int:
    """Return the index for `name`, assigning the next free index if new."""
    existing = self.indices.get(name)
    if existing is not None:
      return existing
    new_index = len(self.names)
    self.indices[name] = new_index
    self.names.append(name)
    return new_index

  def __getitem__(self, item: str) -> int:
    # Unlike index(), lookup of an unknown name raises KeyError(item).
    return self.indices[item]

  def __contains__(self, item: str) -> bool:
    return item in self.indices

  def __len__(self) -> int:
    return len(self.names)


@dataclass
class Node:
  """A Node in the dependency graph returned by deps.dev."""

  # Identifies the package version this node represents.
  version_key: VersionKey
  # Whether the dependency is bundled, as reported by the deps.dev API.
  bundled: bool
  # Relation of this node to the graph's root, as reported by the API
  # (e.g. SELF / DIRECT / INDIRECT — TODO confirm against the API docs).
  relation: str
  # Resolution errors deps.dev reported for this node, if any.
  errors: list[str]


@dataclass
class Edge:
  """An Edge in the dependency graph returned by deps.dev."""

  # Index into DependencyGraph.nodes of the depending package.
  from_node: int
  # Index into DependencyGraph.nodes of the depended-upon package.
  to_node: int
  # The version requirement string that induced this edge.
  requirement: str


@dataclass
class DependencyGraph:
  """A DependencyGraph returned by deps.dev."""

  # All nodes in the graph (node 0 is the root package per the deps.dev
  # API — TODO confirm).
  nodes: list[Node]
  # Edges referencing `nodes` by index.
  edges: list[Edge]
  # Top-level resolution error message; presumably empty on success —
  # verify against the API response.
  error: str


def parse_dependency_graph(json_data) -> DependencyGraph:
  """Given raw JSON data from deps.dev, parse it into a DependencyGraph.

  Missing keys in the payload raise KeyError (the response schema is
  assumed to always carry 'nodes', 'edges', and 'error').
  """
  nodes = [
      Node(
          version_key=VersionKey(**raw_node['versionKey']),
          bundled=raw_node['bundled'],
          relation=raw_node['relation'],
          errors=raw_node['errors'],
      )
      for raw_node in json_data['nodes']
  ]
  edges = [
      Edge(
          from_node=raw_edge['fromNode'],
          to_node=raw_edge['toNode'],
          requirement=raw_edge['requirement'],
      )
      for raw_edge in json_data['edges']
  ]
  return DependencyGraph(nodes=nodes, edges=edges, error=json_data['error'])


@dataclass
class VersionKey:
  """A VersionKey is a unique identifier for a package version."""

  name: str
  system: str
  version: str

  def __post_init__(self):
    self.system = self.system.upper()  # canonicalize system to be upper-case.

  # For robustness when dealing with larger data, we add retry logic.
  # See also https://docs.deps.dev/bigquery/v1/ for BigQuery bulk analysis.
  @retry(stop=stop_after_attempt(3), wait=wait_random(min=2, max=4))
  def get_dependencies(self) -> DependencyGraph:
    """Fetch this version's resolved dependency graph from deps.dev.

    Raises:
      requests.Timeout: if the API does not respond in time (retried).
      requests.HTTPError: if the API returns an error status (retried).
    """
    name_quoted = quote(self.name, safe='')
    url = f'https://api.deps.dev/v3alpha/systems/{self.system}/packages/{name_quoted}/versions/{self.version}:dependencies'
    # A timeout keeps a stuck request from hanging a ThreadPoolExecutor
    # worker forever (requests has no default timeout), and
    # raise_for_status() surfaces HTTP errors immediately so @retry can
    # retry them, instead of failing later with a confusing KeyError while
    # parsing an error payload.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return parse_dependency_graph(resp.json())

  def system_with_name(self) -> str:
    """A unique key for a package in a system, ignoring version."""
    return self.system + ':' + self.name


def matrix_form(graph: nx.Graph) -> tuple[NameMap, spmatrix]:
  """Convert an arbitrarily labeled graph into a set of names and a spmatrix.

  Args:
    graph: a (di)graph whose node labels become the names.

  Returns:
    A (names, adjacency) pair where names maps each node label to its
    row/column index in the len(graph) x len(graph) CSR adjacency matrix.
  """
  names = NameMap()
  # Register every node up front so nodes with no incident edges still get
  # an index; iterating only graph.edges() would leave isolated nodes
  # unnamed (len(names) < len(graph)) and anonymous after relabeling.
  for node in graph.nodes():
    names.index(node)
  vals = []
  rows = []
  cols = []
  for u, v in graph.edges():
    vals.append(1)  # unweighted edge
    rows.append(names[u])
    cols.append(names[v])
  mat = coo_array((vals, (rows, cols)), shape=(len(graph), len(graph))).tocsr()
  return names, mat


def compute_deps_matrix(
    packages: list[VersionKey], num_threads: int
) -> tuple[NameMap, spmatrix]:
  """Given packages, take their dependency graphs and merge them.

  Dependency graphs are fetched from deps.dev concurrently with
  num_threads workers, then merged into a single directed graph keyed by
  system:name (versions collapsed), and returned in matrix form.
  """
  with ThreadPoolExecutor(max_workers=num_threads) as executor:
    graphs = list(executor.map(lambda pkg: pkg.get_dependencies(), packages))
  merged_graph = nx.DiGraph()
  for g in graphs:
    # Node 0 is the root package (per the deps.dev API). Add it explicitly
    # so a package with zero dependencies — whose graph has no edges —
    # still appears in the merged graph instead of silently vanishing.
    if g.nodes:
      merged_graph.add_node(g.nodes[0].version_key.system_with_name())
    for e in g.edges:
      u, v = (
          g.nodes[e.from_node].version_key.system_with_name(),
          g.nodes[e.to_node].version_key.system_with_name(),
      )
      merged_graph.add_edge(u, v)
  return matrix_form(merged_graph)


def compute_authorship_matrix(
    pkg_names: NameMap,
    commits: pd.DataFrame,
) -> tuple[NameMap, spmatrix]:
  """Take commits and construct the authorship matrix.

  Only packages in pkg_names are considered; commits for unknown packages
  are skipped. Returns the author name map and a
  len(pkg_names) x len(author_names) CSR matrix with a 1 for every
  (package, author) commit pair.
  """
  author_names = NameMap()
  pkg_rows = []
  author_cols = []
  for _, commit in commits.iterrows():
    key = VersionKey(
        commit['dependency.name'],
        commit['dependency.system'],
        commit.get('dependency.version', ''),
    ).system_with_name()
    if key not in pkg_names:
      continue  # commit for a package outside the dependency graph
    pkg_rows.append(pkg_names.index(key))
    author_cols.append(author_names.index(commit['author.email']))
  author_matrix = coo_array(
      (
          np.ones(len(pkg_rows)),
          (pkg_rows, author_cols),
      ),
      shape=(len(pkg_names), len(author_names)),
  ).tocsr()
  return author_names, author_matrix


def compute_graph(
    pkg_names: NameMap, deps: spmatrix, authors: spmatrix
) -> nx.DiGraph:
  """Given the D and A matrix, compute a nx graph.

  The dependency matrix is L1-row-normalized and the author matrix is
  L2-row-normalized; A @ A.T then blends shared-authorship similarity into
  the dependency adjacency. Nodes are relabeled back to package names.
  """
  dep_stochastic = normalize(deps, 'l1')
  author_unit = normalize(authors, 'l2')
  blended = dep_stochastic + author_unit @ author_unit.T
  graph = nx.convert_matrix.from_scipy_sparse_array(blended)
  index_to_name = {index: name for name, index in pkg_names.indices.items()}
  nx.relabel_nodes(graph, index_to_name, copy=False)
  return graph


def ingest_packages(pkgs_df: pd.DataFrame) -> list[VersionKey]:
  """Convert packages in dataframe form to a list of VersionKeys."""
  # Iterate the three columns in lockstep rather than materializing a
  # Series per row with iterrows().
  columns = zip(pkgs_df['name'], pkgs_df['system'], pkgs_df['version'])
  return [VersionKey(name, system, version) for name, system, version in columns]


def main() -> None:
  """Parse flags, build the blended graph, and print its PageRank scores."""
  parser = argparse.ArgumentParser(description='Process CSV data')
  parser.add_argument(
      '-p',
      '--packages',
      type=str,
      help=(
          'Path to the package CSV file. Each row represents a deps.dev'
          ' package. Should contain columns name, system, and version. '
      ),
      required=True,
  )
  parser.add_argument(
      '-c',
      '--commits',
      type=str,
      help=(
          'Path to the commits CSV file. Each row represents a commit. Should'
          ' contain columns dependency.name, dependency.system,'
          ' dependency.version, author.email. The first three columns map to'
          ' the columns in the packages CSV file.'
      ),
      required=True,
  )
  parser.add_argument(
      '--input-dependency-graph',
      action='store_true',
      help='If set, read the input packages file as an edgelist of graph D.',
  )
  parser.add_argument(
      '--num-threads',
      type=int,
      default=8,
      help='Number of threads to use to concurrently send API requests.',
  )
  args = parser.parse_args()

  # Build the dependency matrix D either from a precomputed edgelist or by
  # querying deps.dev for each listed package.
  if args.input_dependency_graph:
    pkg_names, D = matrix_form(nx.read_edgelist(args.packages))
  else:
    packages = ingest_packages(pd.read_csv(args.packages))
    pkg_names, D = compute_deps_matrix(packages, num_threads=args.num_threads)

  commits_df = pd.read_csv(args.commits)
  # The author name map is not needed for ranking, only the matrix A.
  _, A = compute_authorship_matrix(pkg_names, commits_df)
  G = compute_graph(pkg_names, D, A)
  print('Graph has ', len(G), ' nodes and ', len(G.edges), ' edges.')
  pr = nx.pagerank(G)
  print('PageRank on repos: ', pr)


if __name__ == '__main__':
  main()
