Spectral Initialization #263

Open · wants to merge 7 commits into main
144 changes: 134 additions & 10 deletions sourcecode/scoring/matrix_factorization/matrix_factorization.py
@@ -9,6 +9,8 @@
import numpy as np
import pandas as pd
import torch
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds


logger = logging.getLogger("birdwatch.matrix_factorization")
@@ -159,6 +161,68 @@ def get_note_and_rater_id_maps(

return noteIdMap, raterIdMap, ratingFeaturesAndLabels

def _form_data_matrix(self, saveMemory=False):
  """
  Args:
    saveMemory: If False, the data is stored as a (number of notes) x (number of raters) numpy array,
      which may be large. This allows the array to be manipulated before the SVD is taken; for
      example, unseen values may be filled with nonzero baselines.
      If True, the data is stored as a scipy csc_matrix, a sparse format similar to the
      ratingFeaturesAndLabels structure. In this case the observed ratings are demeaned and the
      unseen entries are left at 0.

  Returns:
    Tuple[Union[np.ndarray, scipy.sparse.csc_matrix], pd.Index, pd.Index, float]
      data_matrix: a dense or sparse representation of the notes x raters matrix
      df_index: the row labels of data_matrix (the notes' indices)
      df_columns: the column labels of data_matrix (the raters' indices)
      subtracted_intercept: the mean that was subtracted from the ratings data

  The dense (saveMemory=False) result is very large, so the reference is deleted as soon as it is
  no longer needed rather than keeping several copies alive; it could instead be stored as an
  attribute of self to trade that memory for not recomputing the matrix.

  Currently _form_data_matrix is only called when a spectral initialization occurs.
  """
if not saveMemory:
data_df = self.ratingFeaturesAndLabels.pivot(index='noteId', columns='raterParticipantId', values='helpfulNum')
if self.validateModelData is not None:
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]
rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
# Hide the validation ratings from the SVD by marking them unobserved.
valid_row_pos = data_df.index.get_indexer(pd.Series(self.validateModelData.note_indexes.numpy()).map(notes_map_to_id))  # no .detach() needed: these index tensors carry no gradients
valid_col_pos = data_df.columns.get_indexer(pd.Series(self.validateModelData.user_indexes.numpy()).map(rater_map_to_id))
data_df.values[valid_row_pos, valid_col_pos] = np.nan

data_matrix = data_df.values
# Baseline for missing entries: the average of the note (row) and rater (column) means, centered by subtracting the grand mean.
mean_matrix = 1/2*np.nan_to_num(np.nanmean(data_matrix, axis=1), nan=np.nanmean(data_matrix))[:,np.newaxis] \
  + 1/2*np.nan_to_num(np.nanmean(data_matrix, axis=0), nan=np.nanmean(data_matrix)) \
  - np.nanmean(data_matrix)
filled_matrix = np.where(np.isnan(data_matrix), mean_matrix, data_matrix)

return filled_matrix, data_df.index, data_df.columns, np.nanmean(data_matrix)

else:
if self.trainModelData is None:
rating_means = self.ratingFeaturesAndLabels["helpfulNum"].mean()
demeaned_ratings = self.ratingFeaturesAndLabels["helpfulNum"] - rating_means
data_matrix = csc_matrix((demeaned_ratings,
(self.ratingFeaturesAndLabels["noteIndex"], self.ratingFeaturesAndLabels["raterIndex"])))

rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]

else:
rating_means = self.trainModelData.rating_labels.mean()
demeaned_ratings = self.trainModelData.rating_labels - rating_means
data_matrix = csc_matrix((demeaned_ratings,
  (self.trainModelData.note_indexes, self.trainModelData.user_indexes)),
  shape=(max(self.ratingFeaturesAndLabels["noteIndex"])+1, max(self.ratingFeaturesAndLabels["raterIndex"])+1))

rater_map_to_id = self.raterIdMap.set_index(Constants.raterIndexKey)[c.raterParticipantIdKey]
notes_map_to_id = self.noteIdMap.set_index(Constants.noteIndexKey)[c.noteIdKey]

return data_matrix, notes_map_to_id.values, rater_map_to_id.values, rating_means
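
For intuition, a minimal standalone sketch of the dense-path fill above, using made-up ratings: each missing entry is replaced by half its row (note) mean plus half its column (rater) mean, minus the grand mean.

import numpy as np

# Toy 3-notes x 3-raters helpfulness matrix; NaN marks unobserved ratings (values are illustrative only).
R = np.array([[1.0, np.nan, 0.0],
              [np.nan, 0.5, 1.0],
              [0.0, 1.0, np.nan]])

grand = np.nanmean(R)
row_means = np.nan_to_num(np.nanmean(R, axis=1), nan=grand)[:, np.newaxis]
col_means = np.nan_to_num(np.nanmean(R, axis=0), nan=grand)

# Same baseline as mean_matrix above: a blend of row and column effects, centered on the grand mean.
baseline = 0.5 * row_means + 0.5 * col_means - grand
filled = np.where(np.isnan(R), baseline, R)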

def _initialize_parameters(
self,
noteInit: Optional[pd.DataFrame] = None,
@@ -183,17 +247,17 @@ def _initialize_parameters(
noteInit = self.noteIdMap.merge(
noteInit,
on=c.noteIdKey,
how="left",
unsafeAllowed={c.noteIdKey, "noteIndex_y"},
how="left" # ,
# unsafeAllowed={c.noteIdKey, "noteIndex_y"}, my code wouldn't run with this line and I don't see it in the docs?
)

# inplace=True removed to silence a pandas warning; fillna likely makes a temporary copy either way, so inplace probably saved no memory.
noteInit[c.internalNoteInterceptKey] = noteInit[c.internalNoteInterceptKey].fillna(0.0)
self.mf_model.note_intercepts.weight.data = torch.tensor(
np.expand_dims(noteInit[c.internalNoteInterceptKey].astype(np.float32).values, axis=1)
)

for i in range(1, self._numFactors + 1):
noteInit[c.note_factor_key(i)] = noteInit[c.note_factor_key(i)].fillna(0.0)
self.mf_model.note_factors.weight.data = torch.tensor(
noteInit[[c.note_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -205,13 +269,13 @@
logger.info("initializing users")
userInit = self.raterIdMap.merge(userInit, on=c.raterParticipantIdKey, how="left")

userInit[c.internalRaterInterceptKey] = userInit[c.internalRaterInterceptKey].fillna(0.0)
self.mf_model.user_intercepts.weight.data = torch.tensor(
np.expand_dims(userInit[c.internalRaterInterceptKey].astype(np.float32).values, axis=1)
)

for i in range(1, self._numFactors + 1):
userInit[c.rater_factor_key(i)] = userInit[c.rater_factor_key(i)].fillna(0.0)
self.mf_model.user_factors.weight.data = torch.tensor(
userInit[[c.rater_factor_key(i) for i in range(1, self._numFactors + 1)]]
.astype(np.float32)
@@ -221,9 +285,7 @@
if globalInterceptInit is not None:
if self._log:
logger.info("initialized global intercept")
# Write through .data so the existing global_intercept Parameter is updated in place rather than replaced.
self.mf_model.global_intercept.data = torch.ones(1, 1, dtype=torch.float32) * globalInterceptInit
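
Why assigning through `.data` (rather than constructing a fresh `Parameter`) can matter: an optimizer keeps references to the parameter objects it was built with, so rebinding the attribute to a new `Parameter` would leave the optimizer stepping the old tensor. A minimal sketch with toy sizes:

import torch

p = torch.nn.parameter.Parameter(torch.zeros(1, 1))
opt = torch.optim.Adam([p])

# Writing through .data changes the values of the tensor `opt` already tracks;
# rebinding to a brand-new Parameter would instead leave `opt` updating the old one.
p.data = torch.ones(1, 1) * 0.7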

def _get_parameters_from_trained_model(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
@@ -434,7 +496,8 @@ def _fit_model(
rating (torch.FloatTensor)
"""
assert self.mf_model is not None
if self.trainModelData is None:
  self._create_train_validate_sets(validate_percent)
assert self.trainModelData is not None

prev_loss = 1e10
Expand Down Expand Up @@ -495,6 +558,9 @@ def run_mf(
noteInit: pd.DataFrame = None,
userInit: pd.DataFrame = None,
globalInterceptInit: Optional[float] = None,
useSpectralInit: bool = False,
saveMemorySVD: bool = False,
additionalSpectralInitIters: int = 0,
specificNoteId: Optional[int] = None,
validatePercent: Optional[float] = None,
freezeRaterParameters: bool = False,
@@ -511,6 +577,9 @@
noteInit (pd.DataFrame, optional)
userInit (pd.DataFrame, optional)
globalInterceptInit (float, optional).
useSpectralInit (bool, optional): Whether to use an SVD to initialize the factors
saveMemorySVD (bool, optional): When useSpectralInit is set, whether to form the data matrix as a sparse scipy matrix
additionalSpectralInitIters (int, optional): How many times to reinitialize and refit with SVD
specificNoteId (int, optional): Do approximate analysis to score a particular note

Returns:
@@ -550,7 +619,62 @@
self.mf_model.freeze_rater_and_global_parameters()
self.prepare_features_and_labels(specificNoteId)

if useSpectralInit:

self._create_train_validate_sets(validatePercent)
data_matrix, data_index, data_cols, subtracted_intercept = self._form_data_matrix(saveMemory=saveMemorySVD)

U, S, Vt = svds(data_matrix, k=self._numFactors)
# Split the singular value across the two vectors so that outer(note_factors, user_factors) reproduces the corresponding rank-1 SVD term.
note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

noteInit = pd.DataFrame({
c.noteIdKey: data_index,
c.note_factor_key(1): note_factor_init_vals,
c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals))
})
userInit = pd.DataFrame({
c.raterParticipantIdKey: data_cols,
c.rater_factor_key(1): user_factor_init_vals,
c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals))
})
globalInterceptInit = subtracted_intercept
del data_matrix # frees substantial memory when data_matrix is a dense numpy array
# To save further memory, _form_data_matrix could del data_df as soon as the matrix is formed, but the ID orderings would then have to be retrieved again when forming noteInit and userInit.

self._initialize_parameters(noteInit, userInit, globalInterceptInit)

train_loss, loss, validate_loss = self._fit_model(validatePercent)
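
As a standalone sanity check of the scaling used in the spectral initialization above: splitting sqrt(S) across the two vectors makes their outer product equal the rank-1 SVD term (toy dimensions, random data):

import numpy as np
from scipy.sparse.linalg import svds

rng = np.random.default_rng(0)
filled = rng.normal(size=(20, 15))  # stand-in for the filled notes x raters matrix

U, S, Vt = svds(filled, k=1)
note_f = np.sqrt(S[0]) * U.T[0]  # one factor per note
user_f = np.sqrt(S[0]) * Vt[0]   # one factor per rater

# outer(note_f, user_f) == S[0] * outer(u, v), the best rank-1 approximation of `filled`.
assert np.allclose(np.outer(note_f, user_f), S[0] * np.outer(U.T[0], Vt[0]))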

if useSpectralInit:
for _ in range(additionalSpectralInitIters):
# Re-pivot to recover the notes x raters frame with NaN marking unobserved entries (the dense path of _form_data_matrix).
data_df = self.ratingFeaturesAndLabels.pivot(index='noteId', columns='raterParticipantId', values='helpfulNum')
data_matrix = data_df.values
noteParams, raterParams = self._get_parameters_from_trained_model()
intercepts_matrix = np.add.outer(noteParams[c.internalNoteInterceptKey].to_numpy(), raterParams[c.internalRaterInterceptKey].to_numpy())
if self._useGlobalIntercept:
intercepts_matrix = intercepts_matrix + self.mf_model.global_intercept.item()
filled_matrix = np.where(np.isnan(data_matrix), intercepts_matrix, data_matrix)

U, S, Vt = svds(filled_matrix, k=self._numFactors)
note_factor_init_vals = np.sqrt(S[0]) * U.T[0]
user_factor_init_vals = np.sqrt(S[0]) * Vt[0]

noteInit = pd.DataFrame({
c.noteIdKey: data_df.index,
c.note_factor_key(1): note_factor_init_vals,
c.internalNoteInterceptKey: np.zeros(len(note_factor_init_vals))
})
userInit = pd.DataFrame({
c.raterParticipantIdKey: data_df.columns,
c.rater_factor_key(1): user_factor_init_vals,
c.internalRaterInterceptKey: np.zeros(len(user_factor_init_vals))
})
del data_df, data_matrix

self._initialize_parameters(noteInit, userInit, None)  # None keeps the previously fitted global intercept
train_loss, loss, validate_loss = self._fit_model(validatePercent)
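
For intuition on the refill step in the loop above, a standalone toy version: the fitted note and rater intercepts give a per-cell baseline for the unobserved entries, while observed ratings are kept as-is.

import numpy as np

# Illustrative values only: 2 notes x 2 raters with two unobserved cells.
R = np.array([[1.0, np.nan],
              [np.nan, 0.0]])
note_int = np.array([0.2, -0.1])
rater_int = np.array([0.05, 0.0])
global_int = 0.4

# np.add.outer builds the full per-cell intercept grid; np.where keeps the observed ratings.
intercepts = np.add.outer(note_int, rater_int) + global_int
filled = np.where(np.isnan(R), intercepts, R)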

if self._normalizedLossHyperparameters is not None:
_, raterParams = self._get_parameters_from_trained_model()
assert self.modelData is not None