You can install github.observatory
by using:
install.packages("remotes")
remotes::install_github("harell/github.observatory")
Sys.setenv(
AWS_ACCESS_KEY_ID = "<your-access-key-id>",
AWS_SECRET_ACCESS_KEY= "<your-secret>",
AWS_REGION = "ap-southeast-2"
)
github.observatory
gives easy access to the project database through a Repository object named Ecosystem. To access the database, instantiate an Ecosystem object.
ecos <- Ecosystem$new()
agent <- Agent$new(ecos)
dplyr
.user_login
this is a mutable name of the Github account chosen by the User. It is apparent in the Github URL. For example, https://github.com/harell corresponds to user_login
= harell
; and
user_id
this is an immutable number assigned by GitHub when the User creates an account. user_id
is not readily available and has to be deduced from an API call. For example, user_id
= 7226303
, corresponds to user_login
= harell
, and the respective API call is https://api.github.com/user/7226303.
Tip: Among the two options to identify a User, user_id
is preferable as it stays the same throughout the life of GitHub. See the Mapping Entities section for how to locate user_id
with user_login
.
The Ecosystem gives access to SPECTATOR
, FOLLOWING
, DEPENDENCY
, PACKAGE
, REPO
and USER
tables.
Tip: See the content of the tables in the Appendix.
The Recommendation Agent has five functions:
recommend_repos_to_user
Given a user_id
suggests n
repos the user might like;
recommend_users_to_user
Given a user_id
suggests n
users the user might like;
query_repos_graph
Given a repo_id
and a method
, find all linked packages in degrees
degrees of separation;
query_users_graph
Given a user_id
and a method
, find all linked users in degrees
degrees of separation; and
query_package_stats
Given a CRAN package
name, and a particular statistic
(a function of the data sample), return the value of the requested attribute.
(See the Agent
help file for supported methods)
suggested_repos <- agent$recommend_repos_to_user(user_id, n = 5, method = "random")
print(suggested_repos)
#> # A tibble: 5 × 2
#> rank repo_id
#> <int> <int>
#> 1 1 171659421
#> 2 2 85225301
#> 3 3 29305085
#> 4 4 190079713
#> 5 5 107019949
(
suggested_repos
|> dplyr::left_join(ecos$read_REPO(), by = c("repo_id" = "id"))
)
#> # A tibble: 5 × 16
#> rank repo_id package full_…¹ owner…² owner…³ html_…⁴ starg…⁵ watch…⁶ forks…⁷
#> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 1 1.72e8 correl… easyst… Organi… 4.71e7 https:… 324 14 42
#> 2 2 8.52e7 iheatm… ropens… Organi… 1.20e6 https:… 252 21 33
#> 3 3 2.93e7 openair davidc… User 1.24e6 https:… 224 38 95
#> 4 4 1.90e8 ymlthis r-lib/… Organi… 2.26e7 https:… 153 8 8
#> 5 5 1.07e8 linl eddelb… User 6.73e5 https:… 105 7 14
#> # … with 6 more variables: language <chr>, homepage <chr>, created_at <date>,
#> # updated_at <date>, queried_at <date>, processed_at <date>, and abbreviated
#> # variable names ¹full_name, ²owner_type, ³owner_id, ⁴html_url,
#> # ⁵stargazers_count, ⁶watchers_count, ⁷forks_count
(See the Agent
help file for supported methods)
suggested_users <- agent$recommend_users_to_user(user_id, n = 5, method = "random")
print(suggested_users)
#> # A tibble: 5 × 2
#> rank user_id
#> <int> <int>
#> 1 1 4011804
#> 2 2 10975252
#> 3 3 629060
#> 4 4 21288394
#> 5 5 21098186
(
suggested_users
|> dplyr::left_join(ecos$read_USER(), by = c("user_id" = "id"))
)
#> # A tibble: 5 × 18
#> rank user_id login avata…¹ html_…² name publi…³ follo…⁴ follo…⁵ r_fol…⁶
#> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 4011804 pzhaonet https:… https:… Peng… 166 264 6 175
#> 2 2 10975252 andrewli… https:… https:… Andr… 103 190 66 73
#> 3 3 629060 software… https:… https:… Doug… 38 145 10 45
#> 4 4 21288394 abigger87 https:… https:… NA 159 388 546 45
#> 5 5 21098186 greed2411 https:… https:… Jaiv… 58 142 158 36
#> # … with 8 more variables: r_following <dbl>, r_contributor_count <dbl>,
#> # r_watcher_count <dbl>, r_stargazer_count <dbl>, created_at <date>,
#> # updated_at <date>, queried_at <date>, processed_at <date>, and abbreviated
#> # variable names ¹avatar_url, ²html_url, ³public_repos, ⁴followers,
#> # ⁵following, ⁶r_followers
To query the repo graph, you need to supply three input arguments:
repo_id
: (see Mapping Entities for how to map a package name to repo_id);
degrees
: How many neighbourhoods to retrieve?
method
: Either depends
or reverse depends
depends
returns the package dependencies (as they appear in its DESCRIPTION file);
reverse depends
returns packages that are dependent on repo_id
.
repo_dep <- agent$query_repos_graph(repo_id, degrees = 1, method = "depends")
print(repo_dep)
#> # A tibble: 11 × 3
#> degree from to
#> <dbl> <dbl> <dbl>
#> 1 0 6427813 6427813
#> 2 1 6427813 137095400
#> 3 1 6427813 77250656
#> 4 1 6427813 163315535
#> 5 1 6427813 15564525
#> 6 1 6427813 91374446
#> 7 1 6427813 19521307
#> 8 1 6427813 73098312
#> 9 1 6427813 45148983
#> 10 1 6427813 92205415
#> 11 1 6427813 67548397
(
repo_dep
|> dplyr::left_join(ecos$read_REPO(), by = c("to" = "id"))
)
#> # A tibble: 11 × 17
#> degree from to package full_…¹ owner…² owner…³ html_…⁴ starg…⁵ watch…⁶
#> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 0 6427813 6.43e6 dplyr tidyve… Organi… 2.20e7 https:… 4088 253
#> 2 1 6427813 1.37e8 generi… r-lib/… Organi… 2.26e7 https:… 57 4
#> 3 1 6427813 7.73e7 glue tidyve… Organi… 2.20e7 https:… 606 18
#> 4 1 6427813 1.63e8 lifecy… r-lib/… Organi… 2.26e7 https:… 75 6
#> 5 1 6427813 1.56e7 magrit… tidyve… Organi… 2.20e7 https:… 894 56
#> 6 1 6427813 9.14e7 pillar r-lib/… Organi… 2.26e7 https:… 144 10
#> 7 1 6427813 1.95e7 R6 r-lib/… Organi… 2.26e7 https:… 353 21
#> 8 1 6427813 7.31e7 rlang r-lib/… Organi… 2.26e7 https:… 383 21
#> 9 1 6427813 4.51e7 tibble tidyve… Organi… 2.20e7 https:… 530 33
#> 10 1 6427813 9.22e7 tidyse… r-lib/… Organi… 2.26e7 https:… 106 8
#> 11 1 6427813 6.75e7 vctrs r-lib/… Organi… 2.26e7 https:… 233 10
#> # … with 7 more variables: forks_count <dbl>, language <chr>, homepage <chr>,
#> # created_at <date>, updated_at <date>, queried_at <date>,
#> # processed_at <date>, and abbreviated variable names ¹full_name,
#> # ²owner_type, ³owner_id, ⁴html_url, ⁵stargazers_count, ⁶watchers_count
repo_rev_dep <- agent$query_repos_graph(repo_id, degrees = 1, method = "reverse")
print(repo_rev_dep)
#> # A tibble: 2,601 × 3
#> degree from to
#> <dbl> <dbl> <dbl>
#> 1 0 6427813 6427813
#> 2 1 221466628 6427813
#> 3 1 78038172 6427813
#> 4 1 340095903 6427813
#> 5 1 201294705 6427813
#> 6 1 80061030 6427813
#> 7 1 447452022 6427813
#> 8 1 39196192 6427813
#> 9 1 334079562 6427813
#> 10 1 244196790 6427813
#> # … with 2,591 more rows
(
repo_rev_dep
|> dplyr::left_join(ecos$read_REPO(), by = c("from" = "id"))
)
#> # A tibble: 2,601 × 17
#> degree from to package full_…¹ owner…² owner…³ html_…⁴ starg…⁵ watch…⁶
#> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl>
#> 1 0 6.43e6 6.43e6 dplyr tidyve… Organi… 2.20e7 https:… 4088 253
#> 2 1 2.21e8 6.43e6 AATtoo… spirit… User 3.53e7 https:… 0 1
#> 3 1 7.80e7 6.43e6 abjuti… cran/a… Organi… 6.90e6 https:… 1 3
#> 4 1 3.40e8 6.43e6 academ… cjbarr… User 2.96e7 https:… 211 6
#> 5 1 2.01e8 6.43e6 accept cran/a… Organi… 6.90e6 https:… 0 2
#> 6 1 8.01e7 6.43e6 accucor xiaoya… User 1.43e7 https:… 11 7
#> 7 1 4.47e8 6.43e6 ACDC cran/a… Organi… 6.90e6 https:… 0 2
#> 8 1 3.92e7 6.43e6 ACDm cran/a… Organi… 6.90e6 https:… 1 1
#> 9 1 3.34e8 6.43e6 acrona… cran/a… Organi… 6.90e6 https:… 0 3
#> 10 1 2.44e8 6.43e6 ActCR junrui… User 1.41e7 https:… 1 1
#> # … with 2,591 more rows, 7 more variables: forks_count <dbl>, language <chr>,
#> # homepage <chr>, created_at <date>, updated_at <date>, queried_at <date>,
#> # processed_at <date>, and abbreviated variable names ¹full_name,
#> # ²owner_type, ³owner_id, ⁴html_url, ⁵stargazers_count, ⁶watchers_count
To query the users graph, you need to supply three input arguments:
user_id
: (see Mapping Entities for how to map a user_login to user_id);
degrees
: How many neighbourhoods to retrieve?
method
: Either followers
or following
followers
Which users are following user_id
?; or
following
Which users is user_id
following?
user_id
user_followers <- agent$query_users_graph(user_id, degrees = 1, method = "followers")
print(user_followers)
#> # A tibble: 6 × 3
#> degree from to
#> <dbl> <dbl> <dbl>
#> 1 0 7226303 7226303
#> 2 1 52979198 7226303
#> 3 1 31113760 7226303
#> 4 1 10200210 7226303
#> 5 1 3811321 7226303
#> 6 1 6518676 7226303
user_id
followers
(
user_followers
|> dplyr::left_join(ecos$read_USER(), by = c("from" = "id"))
)
#> # A tibble: 6 × 19
#> degree from to login avata…¹ html_…² name publi…³ follo…⁴ follo…⁵
#> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 0 7226303 7226303 harell https:… https:… NA 13 6 64
#> 2 1 52979198 7226303 angelor… https:… https:… Ange… 6 9 18
#> 3 1 31113760 7226303 rfdorne… https:… https:… Rodr… 9 37 183
#> 4 1 10200210 7226303 asdspal https:… https:… NA 22 5 2
#> 5 1 3811321 7226303 maskegg… https:… https:… Am 344 82 3851
#> 6 1 6518676 7226303 bcgalvin https:… https:… Brya… 19 41 367
#> # … with 9 more variables: r_followers <dbl>, r_following <dbl>,
#> # r_contributor_count <dbl>, r_watcher_count <dbl>, r_stargazer_count <dbl>,
#> # created_at <date>, updated_at <date>, queried_at <date>,
#> # processed_at <date>, and abbreviated variable names ¹avatar_url, ²html_url,
#> # ³public_repos, ⁴followers, ⁵following
user_id
following
user_following <- agent$query_users_graph(user_id, degrees = 1, method = "following")
print(user_following)
#> # A tibble: 56 × 3
#> degree from to
#> <dbl> <dbl> <dbl>
#> 1 0 7226303 7226303
#> 2 1 7226303 4196
#> 3 1 7226303 86978
#> 4 1 7226303 93231
#> 5 1 7226303 104391
#> 6 1 7226303 129551
#> 7 1 7226303 205275
#> 8 1 7226303 216319
#> 9 1 7226303 227097
#> 10 1 7226303 470418
#> # … with 46 more rows
user_id
is following
(
user_following
|> dplyr::left_join(ecos$read_USER(), by = c("to" = "id"))
)
#> # A tibble: 56 × 19
#> degree from to login avata…¹ html_…² name publi…³ follo…⁴ follo…⁵
#> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 0 7226303 7226303 harell https:… https:… NA 13 6 64
#> 2 1 7226303 4196 hadley https:… https:… Hadl… 176 22465 0
#> 3 1 7226303 86978 wch https:… https:… Wins… 125 1326 0
#> 4 1 7226303 93231 schloer… https:… https:… Barr… 186 221 6
#> 5 1 7226303 104391 jjallai… https:… https:… J j … 138 1498 0
#> 6 1 7226303 129551 jcheng5 https:… https:… Joe … 159 1780 12
#> 7 1 7226303 205275 jimhest… https:… https:… Jim … 454 1481 8
#> 8 1 7226303 216319 jeroen https:… https:… Jero… 303 1550 356
#> 9 1 7226303 227097 kohske https:… https:… Kohs… 43 61 1
#> 10 1 7226303 470418 jmcphers https:… https:… Jona… 39 389 11
#> # … with 46 more rows, 9 more variables: r_followers <dbl>, r_following <dbl>,
#> # r_contributor_count <dbl>, r_watcher_count <dbl>, r_stargazer_count <dbl>,
#> # created_at <date>, updated_at <date>, queried_at <date>,
#> # processed_at <date>, and abbreviated variable names ¹avatar_url, ²html_url,
#> # ³public_repos, ⁴followers, ⁵following
dplyr
was downloaded every month
package_downloads <- agent$query_package_stats("dplyr", statistic = "monthly downloads")
package_downloads |> tail(n = 12) |> print()
#> # A tibble: 12 × 3
#> date package downloads
#> <date> <chr> <dbl>
#> 1 2021-08-01 dplyr 1483396
#> 2 2021-09-01 dplyr 1630109
#> 3 2021-10-01 dplyr 1821037
#> 4 2021-11-01 dplyr 1680463
#> 5 2021-12-01 dplyr 1438504
#> 6 2022-01-01 dplyr 1411385
#> 7 2022-02-01 dplyr 1552862
#> 8 2022-03-01 dplyr 1764950
#> 9 2022-04-01 dplyr 1791634
#> 10 2022-05-01 dplyr 2132210
#> 11 2022-06-01 dplyr 1855881
#> 12 2022-07-01 dplyr 1659440
plot(
package_downloads$date, package_downloads$downloads,
main = "Monthly downloads of `dplyr` from RStudio CRAN mirror",
type = "b", xlab = "Date", ylab = "Monthly Downloads"
)
#>
#>
#> ## SPECTATOR
#> Rows: 544,970
#> Columns: 3
#> $ repo_id <dbl> 465098622, 465098622, 465098622, 465098622, 465098622, 38745…
#> $ user_id <dbl> 6008722, 6676074, 5859205, 6676074, 10773123, 5303237, 53032…
#> $ user_role <chr> "contributor", "contributor", "watcher", "watcher", "watcher…
#>
#>
#> ## FOLLOWING
#> Rows: 855,628
#> Columns: 2
#> $ from <dbl> 18198064, 18198064, 18198064, 18198064, 30104471, 502944, 502944,…
#> $ to <dbl> 4196, 3680095, 24767886, 31291716, 2377676, 39391, 889502, 187262…
#>
#>
#> ## DEPENDENCY
#> Rows: 94,403
#> Columns: 2
#> $ from <chr> "A3", "A3", "aaSEA", "aaSEA", "aaSEA", "aaSEA", "aaSEA", "aaSEA",…
#> $ to <chr> "pbapply", "xtable", "Bios2cor", "DT", "Hmisc", "magrittr", "netw…
#>
#>
#> ## PACKAGE
#> Rows: 18,481
#> Columns: 6
#> $ package <chr> "A3", "AATtools", "ABACUS", "abbreviate", "abbyyR", "abc"…
#> $ full_name <chr> "cran/A3", "spiritspeak/AATtools", "cran/ABACUS", "cran/a…
#> $ cran_version <chr> "1.0.0", "0.0.2", "1.0.0", "0.1", "0.5.5", "2.2.1", "1.0"…
#> $ cran_date <date> 2015-08-16, 2022-08-12, 2019-09-12, 2021-12-12, 2019-06-…
#> $ title <chr> "Accurate, Adaptable, and Accessible Error Metrics for Pr…
#> $ description <chr> "Supplies tools for tabulating and analyzing the results …
#>
#>
#> ## REPO
#> Rows: 19,198
#> Columns: 15
#> $ id <dbl> 465098622, 387458599, 525857196, 526663547, 526663485…
#> $ package <chr> "rmzqc", "EloSteepness", "tLagInterim", "ambit", "Rob…
#> $ full_name <chr> "ms-quality-hub/rmzqc", "gobbios/elosteepness", "cran…
#> $ owner_type <chr> "Organization", "User", "Organization", "Organization…
#> $ owner_id <dbl> 100718546, 5303237, 6899542, 6899542, 6899542, 267241…
#> $ html_url <chr> "https://github.com/MS-Quality-hub/rmzqc", "https://g…
#> $ stargazers_count <dbl> 0, 0, 0, 0, 0, 2, 0, 0, 0, 18, 1, 0, 0, 0, 0, 0, 0, 0…
#> $ watchers_count <dbl> 3, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,…
#> $ forks_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 1, 0, 0, 0, 0, 0, 0, 0…
#> $ language <chr> "R", "C++", "R", "R", "R", "R", "R", "R", "R", "JavaS…
#> $ homepage <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "https://g3viz.gi…
#> $ created_at <date> 2022-03-02, 2021-07-19, 2022-08-17, 2022-08-19, 2022…
#> $ updated_at <date> 2022-05-31, 2022-01-27, 2022-08-17, 2022-08-19, 2022…
#> $ queried_at <date> 2022-08-21, 2022-08-21, 2022-08-21, 2022-08-21, 2022…
#> $ processed_at <date> 2022-08-21, 2022-08-21, 2022-08-21, 2022-08-21, 2022…
#>
#>
#> ## USER
#> Rows: 142,007
#> Columns: 17
#> $ id <dbl> 94039735, 65328944, 18198064, 56229977, 109984500,…
#> $ login <chr> "lingweiR", "bolrDK", "ksny", "VascoBranco", "YueP…
#> $ avatar_url <chr> "https://avatars.githubusercontent.com/u/94039735?…
#> $ html_url <chr> "https://github.com/lingweiR", "https://github.com…
#> $ name <chr> "Lingwei", "Bo Larsen", "Kevin Snyder", "Vasco Bra…
#> $ public_repos <dbl> 2, 0, 4, 2, 0, 71, 3, 2, 1, 3, 3, 0, 18, 37, 13, 0…
#> $ followers <dbl> 0, 0, 7, 0, 0, 8, 1, 0, 0, 0, 0, 0, 6, 10, 1, 0, 1…
#> $ following <dbl> 0, 0, 6, 0, 0, 9, 0, 0, 0, 1, 0, 0, 6, 12, 0, 0, 5…
#> $ r_followers <dbl> 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1,…
#> $ r_following <dbl> 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 0, 0, 4,…
#> $ r_contributor_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,…
#> $ r_watcher_count <dbl> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
#> $ r_stargazer_count <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,…
#> $ created_at <date> 2021-11-10, 2020-05-14, 2016-03-31, 2019-10-06, 2…
#> $ updated_at <date> 2022-07-31, 2022-04-06, 2022-08-11, 2022-08-12, 2…
#> $ queried_at <date> 2022-08-21, 2022-08-21, 2022-08-21, 2022-08-21, 2…
#> $ processed_at <date> 2022-08-21, 2022-08-21, 2022-08-21, 2022-08-21, 2…