Spectral Initialization #263

Open · wants to merge 7 commits into main
144 changes: 134 additions & 10 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -9,6 +9,8 @@
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds


logger = logging.getLogger("birdwatch.matrix_factorization")
@@ -159,6 +161,68 @@ def get_note_and_rater_id_maps(

return noteIdMap, raterIdMap, ratingFeaturesAndLabels

def _form_data_matrix(self, saveMemory=False):
  """
  Args:
    saveMemory: If False, the data is stored as a (number of notes) x (number of raters) numpy array,
      which may be large. This allows the array to be manipulated before the SVD is taken; for
      example, unseen values may be filled with nonzero baselines.
      If True, the data is stored as a scipy csc_matrix, a sparse format similar to the
      ratingFeaturesAndLabels structure. In this case the observed ratings are demeaned and the
      unseen entries are left at 0.

  Returns:
    Tuple[Union[np.ndarray, scipy.sparse.csc_matrix], pd.Index, pd.Index, float]
      data_matrix: a dense or sparse representation of the notes x raters matrix
      df_index: the row labels of data_matrix (the notes' indices)
      df_columns: the column labels of data_matrix (the raters' indices)
      subtracted_intercept: the mean that was subtracted from the ratings data

  The dense (saveMemory=False) result is very large, so the reference is deleted as soon as it is
  no longer needed rather than keeping several copies alive; it could instead be stored as an
  attribute of self to trade that memory for not recomputing the matrix.

  Currently _form_data_matrix is only called when a spectral initialization occurs.
  """
if not saveMemory:
data_df = self.ratingFeaturesAndLabels.pivot(index='noteId', columns='raterParticipantId', values='helpfulNum')
if self.validateModelData is not None:
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]
rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
# Hide the validation ratings from the SVD by marking them unobserved.
valid_row_pos = data_df.index.get_indexer(pd.Series(self.validateModelData.note_indexes.numpy()).map(notes_map_to_id))  # no .detach() needed: these index tensors carry no gradients
valid_col_pos = data_df.columns.get_indexer(pd.Series(self.validateModelData.user_indexes.numpy()).map(rater_map_to_id))
data_df.values[valid_row_pos, valid_col_pos] = np.nan

data_matrix = data_df.values
# Baseline for missing entries: the average of the note (row) and rater (column) means, centered by subtracting the grand mean.
mean_matrix = 1/2*np.nan_to_num(np.nanmean(data_matrix, axis=1), nan=np.nanmean(data_matrix))[:,np.newaxis] \
  + 1/2*np.nan_to_num(np.nanmean(data_matrix, axis=0), nan=np.nanmean(data_matrix)) \
  - np.nanmean(data_matrix)
filled_matrix = np.where(np.isnan(data_matrix), mean_matrix, data_matrix)

return filled_matrix, data_df.index, data_df.columns, np.nanmean(data_matrix)

else:
if self.trainModelData is None:
rating_means = self.ratingFeaturesAndLabels["helpfulNum"].mean()
demeaned_ratings = self.ratingFeaturesAndLabels["helpfulNum"] - rating_means
data_matrix = csc_matrix((demeaned_ratings,
(self.ratingFeaturesAndLabels["noteIndex"], self.ratingFeaturesAndLabels["raterIndex"])))

rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]

else:
rating_means = self.trainModelData.rating_labels.mean()
demeaned_ratings = self.trainModelData.rating_labels - rating_means
data_matrix = csc_matrix((demeaned_ratings,
  (self.trainModelData.note_indexes, self.trainModelData.user_indexes)),
  shape=(max(self.ratingFeaturesAndLabels["noteIndex"])+1, max(self.ratingFeaturesAndLabels["raterIndex"])+1))

rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]

return data_matrix, notes_map_to_id.values, rater_map_to_id.values, rating_means
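
For intuition, a minimal standalone sketch of the dense-path fill above, using made-up ratings: each missing entry is replaced by half its row (note) mean plus half its column (rater) mean, minus the grand mean.

import numpy as np

# Toy 3-notes x 3-raters helpfulness matrix; NaN marks unobserved ratings (values are illustrative only).
R = np.array([[1.0, np.nan, 0.0],
              [np.nan, 0.5, 1.0],
              [0.0, 1.0, np.nan]])

grand = np.nanmean(R)
row_means = np.nan_to_num(np.nanmean(R, axis=1), nan=grand)[:, np.newaxis]
col_means = np.nan_to_num(np.nanmean(R, axis=0), nan=grand)

# Same baseline as mean_matrix above: a blend of row and column effects, centered on the grand mean.
baseline = 0.5 * row_means + 0.5 * col_means - grand
filled = np.where(np.isnan(R), baseline, R)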

def _initialize_parameters(
self,
noteInit: Optional[pd.DataFrame] = None,
@@ -183,17 +247,17 @@ def _initialize_parameters(
noteInit = self.noteIdMap.merge(
noteInit,
on=c.noteIdKey,
how="left",
unsafeAllowed={c.noteIdKey, "noteIndex_y"},
how="left" # ,
# unsafeAllowed={c.noteIdKey, "noteIndex_y"}, my code wouldn't run with this line and I don't see it in the docs?
)

# inplace=True removed to silence a pandas warning; fillna likely makes a temporary copy either way, so inplace probably saved no memory.
noteInit[c.internalNoteInterceptKey] = noteInit[c.internalNoteInterceptKey].fillna(0.0)
self.mf_model.note_intercepts.weight.data = torch.tensor(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
)

for i in range(1, self._numFactors + 1):
noteInit[c.note_factor_key(i)] = noteInit[c.note_factor_key(i)].fillna(0.0)
self.mf_model.note_factors.weight.data = torch.tensor(
noteInit[[c.note_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -205,13 +269,13 @@
logger.info("initializing users")
userInit = self.raterIdMap.merge(userInit, on=c.raterParticipantIdKey, how="left")

userInit[c.internalRaterInterceptKey] = userInit[c.internalRaterInterceptKey].fillna(0.0)
self.mf_model.user_intercepts.weight.data = torch.tensor(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
)

for i in range(1, self._numFactors + 1):
userInit[c.rater_factor_key(i)] = userInit[c.rater_factor_key(i)].fillna(0.0)
self.mf_model.user_factors.weight.data = torch.tensor(
userInit[[c.rater_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -221,9 +285,7 @@
if globalInterceptInit is not None:
if self._log:
logger.info("initialized global intercept")
# Write through .data so the existing global_intercept Parameter is updated in place rather than replaced.
self.mf_model.global_intercept.data = torch.ones(1, 1, dtype=torch.float32) * globalInterceptInit
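
Why assigning through `.data` (rather than constructing a fresh `Parameter`) can matter: an optimizer keeps references to the parameter objects it was built with, so rebinding the attribute to a new `Parameter` would leave the optimizer stepping the old tensor. A minimal sketch with toy sizes:

import torch

p = torch.nn.parameter.Parameter(torch.zeros(1, 1))
opt = torch.optim.Adam([p])

# Writing through .data changes the values of the tensor `opt` already tracks;
# rebinding to a brand-new Parameter would instead leave `opt` updating the old one.
p.data = torch.ones(1, 1) * 0.7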

def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
@@ -434,7 +496,8 @@ def _fit_model(
rating (torch.FloatTensor)
"""
assert self.mf_model is not None
if self.trainModelData is None:
  self._create_train_validate_sets(validate_percent)
assert self.trainModelData is not None

prev_loss = 1e10
Expand Down Expand Up @@ -495,6 +558,9 @@ def run_mf(
noteInit: pd.DataFrame = None,
userInit: pd.DataFrame = None,
globalInterceptInit: Optional[float] = None,
useSpectralInit: bool = False,
saveMemorySVD: bool = False,
additionalSpectralInitIters: int = 0,
specificNoteId: Optional[int] = None,
validatePercent: Optional[float] = None,
freezeRaterParameters: bool = False,
@@ -511,6 +577,9 @@
noteInit (pd.DataFrame, optional)
userInit (pd.DataFrame, optional)
globalInterceptInit (float, optional).
useSpectralInit (bool, optional): Whether to use an SVD to initialize the factors
saveMemorySVD (bool, optional): When useSpectralInit is set, whether to form the data matrix as a sparse scipy matrix
additionalSpectralInitIters (int, optional): How many times to reinitialize and refit with SVD
specificNoteId (int, optional): Do approximate analysis to score a particular note

Returns:
@@ -550,7 +619,62 @@
self.mf_model.freeze_rater_and_global_parameters()
self.prepare_features_and_labels(specificNoteId)

if useSpectralInit:

self._create_train_validate_sets(validatePercent)
data_matrix, data_index, data_cols, subtracted_intercept = self._form_data_matrix(saveMemory=saveMemorySVD)

U, S, Vt = svds(data_matrix, k=self._numFactors)
# Split the singular value across the two vectors so that outer(note_factors, user_factors) reproduces the corresponding rank-1 SVD term.
note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

noteInit = pd.DataFrame({
c.noteIdKey: data_index,
c.note_factor_key(1): note_factor_init_vals,
c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals))
})
userInit = pd.DataFrame({
c.raterParticipantIdKey: data_cols,
c.rater_factor_key(1): user_factor_init_vals,
c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals))
})
globalInterceptInit = subtracted_intercept
del data_matrix # frees substantial memory when data_matrix is a dense numpy array
# To save further memory, _form_data_matrix could del data_df as soon as the matrix is formed, but the ID orderings would then have to be retrieved again when forming noteInit and userInit.

self._initialize_parameters(noteInit, userInit, globalInterceptInit)

train_loss, loss, validate_loss = self._fit_model(validatePercent)
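
As a standalone sanity check of the scaling used in the spectral initialization above: splitting sqrt(S) across the two vectors makes their outer product equal the rank-1 SVD term (toy dimensions, random data):

import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.default_rng(0)
filled = rng.normal(size=(20, 15))  # stand-in for the filled notes x raters matrix

U, S, Vt = svds(filled, k=1)
note_f = np.sqrt(S[0]) * U.T[0]  # one factor per note
user_f = np.sqrt(S[0]) * Vt[0]   # one factor per rater

# outer(note_f, user_f) == S[0] * outer(u, v), the best rank-1 approximation of `filled`.
assert np.allclose(np.outer(note_f, user_f), S[0] * np.outer(U.T[0], Vt[0]))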

if useSpectralInit:
for _ in range(additionalSpectralInitIters):
# Re-pivot to recover the notes x raters frame with NaN marking unobserved entries (the dense path of _form_data_matrix).
data_df = self.ratingFeaturesAndLabels.pivot(index='noteId', columns='raterParticipantId', values='helpfulNum')
data_matrix = data_df.values
noteParams, raterParams = self._get_parameters_from_trained_model()
intercepts_matrix = np.add.outer(noteParams[c.internalNoteInterceptKey].to_numpy(), raterParams[c.internalRaterInterceptKey].to_numpy())
if self._useGlobalIntercept:
intercepts_matrix = intercepts_matrix + self.mf_model.global_intercept.item()
filled_matrix = np.where(np.isnan(data_matrix), intercepts_matrix, data_matrix)

U, S, Vt = svds(filled_matrix, k=self._numFactors)
note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

noteInit = pd.DataFrame({
c.noteIdKey: data_df.index,
c.note_factor_key(1): note_factor_init_vals,
c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals))
})
userInit = pd.DataFrame({
c.raterParticipantIdKey: data_df.columns,
c.rater_factor_key(1): user_factor_init_vals,
c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals))
})
del data_df, data_matrix

self._initialize_parameters(noteInit, userInit, None)  # None keeps the previously fitted global intercept
train_loss, loss, validate_loss = self._fit_model(validatePercent)
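
For intuition on the refill step in the loop above, a standalone toy version: the fitted note and rater intercepts give a per-cell baseline for the unobserved entries, while observed ratings are kept as-is.

import numpy as np

# Illustrative values only: 2 notes x 2 raters with two unobserved cells.
R = np.array([[1.0, np.nan],
              [np.nan, 0.0]])
note_int = np.array([0.2, -0.1])
rater_int = np.array([0.05, 0.0])
global_int = 0.4

# np.add.outer builds the full per-cell intercept grid; np.where keeps the observed ratings.
intercepts = np.add.outer(note_int, rater_int) + global_int
filled = np.where(np.isnan(R), intercepts, R)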

if self._normalizedLossHyperparameters is not None:
_, raterParams = self._get_parameters_from_trained_model()
assert self.modelData is not None