Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f38fcf84 authored by Guillaume Jacquart's avatar Guillaume Jacquart
Browse files

feat:5629: Improve comments about whotracksme trackers list parser.

parent 654c37e6
Loading
Loading
Loading
Loading
+62 −50
Original line number Diff line number Diff line
@@ -2,46 +2,52 @@ const fetcher = require("./fetch_promise.js")

const whotracksmeUri = "https://raw.githubusercontent.com/whotracksme/whotracks.me/master/whotracksme/data/assets/trackerdb.sql"

/*
 * createTrackersFromWhoTrackMe(wtmLines) builds a list of tracker objects
 * from the SQL source of the who-tracks-me extract.
 * 
 * The data is extracted from lines like these:
 * INSERT INTO trackers VALUES('1000mercis','1000mercis',13,NULL,'1000mercis','2662',NULL,NULL);
 * INSERT INTO trackers VALUES('161media','Platform161',1,'https://platform161.com/','platform161','730',NULL,NULL);
 * 
 *
 * These statements were originally intended to populate the SQL table defined as follows:
 * CREATE TABLE trackers (
 *   id TEXT NOT NULL UNIQUE,
 *   name TEXT NOT NULL,
 *   category_id INTEGER,
 *   website_url TEXT,
 *   company_id TEXT,
 *   ghostery_id TEXT,
 *   notes TEXT,
 *   alias TEXT REFERENCES trackers (id),
 *   FOREIGN KEY (category_id) REFERENCES categories (id),
 *   FOREIGN KEY (company_id) REFERENCES companies (id)
 * );
 *
 * The category_id is an enum defined as follows:
 * INSERT INTO categories VALUES(1,'advertising');
 * INSERT INTO categories VALUES(2,'audio_video_player');
 * INSERT INTO categories VALUES(3,'cdn');
 * INSERT INTO categories VALUES(4,'comments');
 * INSERT INTO categories VALUES(5,'consent');
 * INSERT INTO categories VALUES(6,'customer_interaction');
 * INSERT INTO categories VALUES(7,'email');
 * INSERT INTO categories VALUES(8,'essential');
 * INSERT INTO categories VALUES(9,'extensions');
 * INSERT INTO categories VALUES(10,'hosting');
 * INSERT INTO categories VALUES(11,'misc');
 * INSERT INTO categories VALUES(12,'pornvertising');
 * INSERT INTO categories VALUES(13,'site_analytics');
 * INSERT INTO categories VALUES(14,'social_media');
 * INSERT INTO categories VALUES(15,'telemetry');
 *
 * 
 */
function createTrackersFromWhoTrackMe(wtmLines) {
  
  const domainsByTrackers = parseDomainTrackers(wtmLines)
  
 
// categories enum values
// INSERT INTO categories VALUES(1,'advertising');
// INSERT INTO categories VALUES(2,'audio_video_player');
// INSERT INTO categories VALUES(3,'cdn');
// INSERT INTO categories VALUES(4,'comments');
// INSERT INTO categories VALUES(5,'consent');
// INSERT INTO categories VALUES(6,'customer_interaction');
// INSERT INTO categories VALUES(7,'email');
// INSERT INTO categories VALUES(8,'essential');
// INSERT INTO categories VALUES(9,'extensions');
// INSERT INTO categories VALUES(10,'hosting');
// INSERT INTO categories VALUES(11,'misc');
// INSERT INTO categories VALUES(12,'pornvertising');
// INSERT INTO categories VALUES(13,'site_analytics');
// INSERT INTO categories VALUES(14,'social_media');
// INSERT INTO categories VALUES(15,'telemetry');

// SQL Schema
// CREATE TABLE trackers (
//   id TEXT NOT NULL UNIQUE,
//   name TEXT NOT NULL,
//   category_id INTEGER,
//   website_url TEXT,
//   company_id TEXT,
//   ghostery_id TEXT,
//   notes TEXT,
//   alias TEXT REFERENCES trackers (id),
//   FOREIGN KEY (category_id) REFERENCES categories (id),
//   FOREIGN KEY (company_id) REFERENCES companies (id)
// );

// Example of the line we will manually parse:
// INSERT INTO trackers VALUES('1000mercis','1000mercis',13,NULL,'1000mercis','2662',NULL,NULL);
// INSERT INTO trackers VALUES('161media','Platform161',1,'https://platform161.com/','platform161','730',NULL,NULL);

  const trackersList = []
  wtmLines.forEach(l => {
    if (l.startsWith("INSERT INTO trackers VALUES(")) {
@@ -67,25 +73,31 @@ function createTrackersFromWhoTrackMe(wtmLines) {
    return trackersList
}

/*
 * parseDomainTrackers(wtmLines) builds a mapping: trackerId -> [domains]
 * from the SQL source of the who-tracks-me extract.
 * 
 * The data is extracted from lines like these:
 * 
 * INSERT INTO tracker_domains VALUES('1000mercis','mmtro.com',NULL);
 * INSERT INTO tracker_domains VALUES('161media','creative-serving.com',NULL);
 * 
 * These statements were originally intended to populate the SQL table defined as follows:
 * 
 * CREATE TABLE tracker_domains (
 *   tracker TEXT NOT NULL,
 *   domain TEXT UNIQUE NOT NULL,
 *   notes TEXT,
 *   FOREIGN KEY (tracker) REFERENCES trackers (id)
 * );
 * 
 */
function parseDomainTrackers(wtmLines) {
// SQL schema
// CREATE TABLE tracker_domains (
//   tracker TEXT NOT NULL,
//   domain TEXT UNIQUE NOT NULL,
//   notes TEXT,
//   FOREIGN KEY (tracker) REFERENCES trackers (id)
//);
//
// Example of the line we will manually parse:
// INSERT INTO tracker_domains VALUES('1000mercis','mmtro.com',NULL);
// INSERT INTO tracker_domains VALUES('161media','creative-serving.com',NULL);

  const domainsByTrackers = {}
  wtmLines.forEach(l => {
    if (l.startsWith("INSERT INTO tracker_domains VALUES(")) {
      const match = l.split("'")
                
            
      const tracker = match[1]
      let domains = domainsByTrackers[tracker]
      if (!domains) { domains = new Set() }