Thursday, March 3, 2016

Delete washes in accounting data

--******************************************************************
--******************************************************************
--******************************************************************
--******************************************************************
-- Algorithm by Carlos Klapp
--******************************************************************
--******************************************************************
--******************************************************************
--******************************************************************
/*

This algorithm splits the data into positive and negative values. 
    It then finds the rows in the same position and flags them as a 
    wash. If the row does not have a matching pair then it is a
    unique transaction and is NOT a wash.

|             Positive values         |             |             Negative values         |
| AccountId | TransactionDate | Value |             | AccountId | TransactionDate | Value |
|         1 |      2016/01/01 |    $7 |     <->     |         1 |      2016/01/01 |   -$7 | <- can be deleted
|         1 |      2016/02/01 |    $7 |     <->     |         1 |      2016/06/01 |   -$7 | <- can be deleted
|         1 |      2016/04/01 |    $7 |     <->     |         1 |      2016/07/01 |   -$7 | <- can be deleted
|           |                 |       |             |         1 |      2016/09/01 |   -$7 | <- keep this row
|         2 |      2016/05/01 |    $8 |     <->     |         2 |      2016/11/01 |   -$8 | <- can be deleted
|           |                 |       |             |         2 |      2016/12/01 |   -$8 | <- keep this row


*/





--******************************************************************
--******************************************************************
-- Build a tally table
--******************************************************************
--******************************************************************
IF OBJECT_ID('dbo.Numbers') IS NOT NULL
    DROP TABLE dbo.Numbers
    
SELECT TOP 10000 N=IDENTITY(INT, 1, 1)
INTO dbo.Numbers
FROM master.dbo.syscolumns a CROSS JOIN master.dbo.syscolumns  b;

ALTER TABLE dbo.Numbers ADD CONSTRAINT NBR_pk PRIMARY KEY(N);


--******************************************************************
--******************************************************************
-- Create the table that will hold our test data
--******************************************************************
--******************************************************************
IF OBJECT_ID('dbo.WashValues') IS NOT NULL
    DROP TABLE dbo.WashValues

CREATE TABLE dbo.WashValues(
    RowId int IDENTITY(1,1) PRIMARY KEY
    ,AccountId INT NOT NULL
    ,TransactionDate datetime NOT NULL
    ,Value MONEY NOT NULL
    ,AbsValue AS CAST(ISNULL(abs(Value), 0) AS MONEY) PERSISTED
    ,IsWash BIT NOT NULL DEFAULT(0)
)


--******************************************************************
--******************************************************************
-- Create our sample data
--******************************************************************
--******************************************************************
INSERT INTO dbo.WashValues(AccountId, TransactionDate, Value)
SELECT
    CAST(CRYPT_GEN_RANDOM(1) AS INT) % 10 + 1 AS AccountId
    ,DATEADD(DAY, n.N, GETDATE()) AS TransactionDate
    ,CAST(CRYPT_GEN_RANDOM(1) AS INT) % 10 * IIF(CAST(CRYPT_GEN_RANDOM(1) AS INT) % 2 = 0, 1, -1) AS Value
FROM
    dbo.Numbers n
WHERE
    n.N BETWEEN 1 AND 10000


--******************************************************************
-- Insert unique case: single positive row without a wash
--******************************************************************
INSERT INTO dbo.WashValues(AccountId, TransactionDate, Value)
SELECT
    999 AS AccountId
    ,GETDATE() AS TransactionDate
    ,999 AS Value


--******************************************************************
-- Insert unique case: single negative row without a wash
--******************************************************************
INSERT INTO dbo.WashValues(AccountId, TransactionDate, Value)
SELECT
    888 AS AccountId
    ,GETDATE() AS TransactionDate
    ,-888 AS Value



--******************************************************************
-- Delete the zero money rows. They sum to zero, so we can ignore them.
--******************************************************************
DELETE from dbo.WashValues WHERE AbsValue = 0


--******************************************************************
--******************************************************************
--******************************************************************
--******************************************************************
-- Split the data into a positive and negative list. In this
--  example we order the lists by the TransactionDate in ascending
--  order. This essentially gives us a FIFO (First In First Out)
--  list.
--******************************************************************
--******************************************************************
--******************************************************************
--******************************************************************

--******************************************************************
-- Build negative list
--******************************************************************
IF OBJECT_ID('dbo.WashValues_Negative') IS NOT NULL
    DROP TABLE dbo.WashValues_Negative

SELECT
    RowId
    ,AccountId
    ,AbsValue
    ,ROW_NUMBER() OVER(PARTITION BY AccountId, Value ORDER BY TransactionDate) AS SequenceId
INTO dbo.WashValues_Negative
FROM 
    dbo.WashValues
WHERE
    Value < 0

    
--******************************************************************
-- Build negative list
--******************************************************************
IF OBJECT_ID('dbo.WashValues_Positive') IS NOT NULL
    DROP TABLE dbo.WashValues_Positive

SELECT
    RowId
    ,AccountId
    ,AbsValue
    ,ROW_NUMBER() OVER(PARTITION BY AccountId, Value ORDER BY TransactionDate) AS SequenceId
INTO dbo.WashValues_Positive
FROM 
    dbo.WashValues
WHERE
    Value > 0



--******************************************************************
--******************************************************************
-- Locate the equivalent rows and mark them as a wash.
--******************************************************************
--******************************************************************
UPDATE w
SET
    w.IsWash = 1
FROM
    dbo.WashValues w
    JOIN (
        SELECT 
            p.RowId AS n_RowId
            ,n.RowId AS p_RowId
        FROM 
            dbo.WashValues_Positive p
            JOIN dbo.WashValues_Negative n
                ON p.AccountId = n.AccountId
                    AND n.AbsValue = p.AbsValue
                    AND n.SequenceId = p.SequenceId
    ) q
        ON w.RowId IN (q.n_RowId, q.p_RowId)



--******************************************************************
--******************************************************************
-- Debug: Inspect the data and make sure the rows are
--  correctly paired.
--******************************************************************
--******************************************************************
SELECT 
    pw.RowId
   ,pw.AccountId
   ,pw.TransactionDate
   ,pw.Value
   ,pw.AbsValue
   ,pw.IsWash
   ,p.SequenceId
   ,nw.RowId
   ,nw.AccountId
   ,nw.TransactionDate
   ,nw.Value
   ,nw.AbsValue
   ,nw.IsWash
   ,n.SequenceId
FROM 
    dbo.WashValues_Positive p
    JOIN dbo.WashValues_Negative n
        ON p.AccountId = n.AccountId
            AND n.AbsValue = p.AbsValue
            AND n.SequenceId = p.SequenceId
    JOIN dbo.WashValues pw
        ON pw.RowId = p.RowId
    JOIN dbo.WashValues nw
        ON nw.RowId = n.RowId
GROUP BY
    pw.RowId
   ,pw.AccountId
   ,pw.TransactionDate
   ,pw.Value
   ,pw.AbsValue
   ,pw.IsWash
   ,p.SequenceId
   ,nw.RowId
   ,nw.AccountId
   ,nw.TransactionDate
   ,nw.Value
   ,nw.AbsValue
   ,nw.IsWash
   ,n.SequenceId
ORDER BY
    pw.AccountId
    ,pw.AbsValue
    ,p.SequenceId

    


--******************************************************************
--******************************************************************
-- Unit Test: The grand total by AccountId must always be the same.
--******************************************************************
--******************************************************************
IF EXISTS(
        SELECT
            *
        FROM
            (
                SELECT
                    AccountId
                    ,SUM(Value) AS Total
                    ,COUNT(*) AS NumRows
                FROM
                    dbo.WashValues
                WHERE
                    IsWash = 0
                GROUP BY
                    AccountId
            ) q1
            FULL OUTER JOIN (
                SELECT
                    AccountId
                    ,SUM(Value) AS Total
                    ,COUNT(*) AS NumRows
                FROM
                    dbo.WashValues
                GROUP BY
                    AccountId
            ) q2
                ON q2.AccountId = q1.AccountId
                    AND q2.Total = q1.Total
        where  
            q1.AccountId IS NULL
            OR q2.AccountId IS null
    )
BEGIN
    RAISERROR('Bug in code. Total do not match', 18, 0)
end