How can I merge similar records that have different validity dates?


10

The table I am working with has three components:

  1. An ID column (a primary key in another table)
  2. Several data columns
  3. Valid from / to date columns.

Values:

ID   Data From        To  
1    a    2015-01-01  2015-01-05
1    a    2015-01-06  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-05
2    c    2015-01-06  2015-01-10

The table is updated by taking "snapshots" of another data source at some interval and assigning validity dates to the records. The problem is that these snapshots create duplicate entries (with different validity dates) for records that did not change at all during that interval.

I want to reduce the size of the table by finding rows with consecutive dates, merging them, and assigning them a single validity period. For example:

ID   Data From        To  
1    a    2015-01-01  2015-01-10
1    b    2015-01-11  2015-01-15
1    a    2015-01-16  2015-01-20
2    c    2015-01-01  2015-01-10

The logic I currently have is:

  1. Select and sort all rows by ID, the data fields, and the 'valid from' field (so that they fall into groups of consecutive rows).
  2. Use a cursor to compare adjacent rows for similarity.
  3. If they are the same, merge the rows and change the validity period to cover both (roughly as sketched below).

I understand that cursors are very inefficient (I have a large dataset), so I am looking for another approach.
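
For reference, a minimal sketch of the cursor-based logic described above. It assumes the data sits in a work table named #mergeTest with columns [id], [data], [from], [to] (the table name and the char(1) type are assumptions, and a single [data] column stands in for the several data columns), and that a consecutive row starts the day after the previous one ends, as in the sample data:

DECLARE @id int, @data char(1), @from date, @to date;
DECLARE @prevId int, @prevData char(1), @prevFrom date, @prevTo date;

DECLARE merge_cur CURSOR LOCAL STATIC FOR       -- STATIC: iterate over a snapshot while the base table is modified
    SELECT [id], [data], [from], [to]
    FROM #mergeTest
    ORDER BY [id], [data], [from];

OPEN merge_cur;
FETCH NEXT FROM merge_cur INTO @id, @data, @from, @to;

WHILE @@FETCH_STATUS = 0
BEGIN
    IF @prevId = @id AND @prevData = @data AND @from = DATEADD(DAY, 1, @prevTo)
    BEGIN
        -- Consecutive duplicate: extend the previous row and remove the current one.
        UPDATE #mergeTest SET [to] = @to
            WHERE [id] = @prevId AND [data] = @prevData AND [from] = @prevFrom;
        DELETE FROM #mergeTest
            WHERE [id] = @id AND [data] = @data AND [from] = @from;
        SET @prevTo = @to;                      -- the merged row now ends where the absorbed row ended
    END
    ELSE
    BEGIN
        -- Different id/data or a gap in dates: start a new run.
        SELECT @prevId = @id, @prevData = @data, @prevFrom = @from, @prevTo = @to;
    END

    FETCH NEXT FROM merge_cur INTO @id, @data, @from, @to;
END

CLOSE merge_cur;
DEALLOCATE merge_cur;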


2
Also: add a CREATE TABLE statement to the question.
ypercubeᵀᴹ

2
How big is a "large" dataset? Why not fix the snapshot import so it doesn't create the problem in the first place?
Paul White 9

On the order of millions of records. I don't have permission to change how the table is populated. Plus, that wouldn't solve the problem with past records.
hazrmard

Answers:


8

If this really is just a table of back-to-back ranges, your case can be treated as the classic "gaps and islands" problem, where you only need to isolate islands of consecutive ranges and then "condense" them by taking the minimum [from] and the maximum [to] per island.

There is an established method for solving this using two ROW_NUMBER calls:

WITH islands AS
(
  SELECT
    id,
    data,
    [from],
    [to],
    island = ROW_NUMBER() OVER (PARTITION BY id       ORDER BY [from])
           - ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from])
  FROM
    #mergeTest
)
SELECT
  id,
  data,
  [from] = MIN([from]),
  [to]   = MAX([to])
FROM
  islands
GROUP BY
  id,
  data,
  island
;

This query will work in versions as far back as SQL Server 2005.
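
For the sample data in the question, the difference of the two ROW_NUMBER values stays constant within each run of identical data (0 for the first two "a" rows of ID 1, 1 for the later "a" row, 2 for the "b" row, and 0 for both "c" rows of ID 2), so each (id, data, island) group collapses into one condensed row. To actually persist the condensed rows, one option is a rough sketch along these lines, assuming the original table can simply be truncated and reloaded (#condensed is just a scratch name used here):

SELECT id, data, [from] = MIN([from]), [to] = MAX([to])
INTO #condensed
FROM
(
    SELECT
        id, data, [from], [to],
        island = ROW_NUMBER() OVER (PARTITION BY id       ORDER BY [from])
               - ROW_NUMBER() OVER (PARTITION BY id, data ORDER BY [from])
    FROM #mergeTest
) AS islands
GROUP BY id, data, island;

TRUNCATE TABLE #mergeTest;        -- replace the original contents with the condensed rows

INSERT INTO #mergeTest (id, data, [from], [to])
SELECT id, data, [from], [to]
FROM #condensed;

DROP TABLE #condensed;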


1

I managed to write a query to tackle this problem. It uses several joins and a while loop to merge the records. The code is compatible with SQL Server 2008 R2.

CREATE TABLE #mergeTest
(
    [id] int NOT NULL,
    [data] date,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO #mergeTest ([id],[data],[from],[to]) VALUES     --testing null data value handling
    (1,NULL,'2015-01-01','2015-01-05'), --1
    (1,NULL,'2015-01-05','2015-01-10'), --2
    (1,'2000-01-01','2015-01-10','2015-01-14'), --3
    (1,'2000-01-03','2015-01-14','2015-01-15'), --4
    (1,'2000-01-01','2015-01-15','2015-01-20'), --5
    (1,'2000-01-01','2015-01-20','2015-01-22'), --5
    (1,'2000-01-01','2015-01-22','2015-01-25'), --6
    (1,'2000-01-01','2015-01-25','2015-01-30'), --7
    (1,NULL,'2015-01-30','2015-02-04'), --8
    (2,'2000-01-05','2015-01-01','2015-01-05'), --9
    (2,'2000-01-05','2015-01-05','2015-01-10')  --10

SELECT * FROM #mergeTest 
GO
;

SELECT * INTO #tempSingle                               --isolate single records. Single records need no processing.
    FROM (
        SELECT  [id], [data], MIN([from]) as [from], MIN([to]) as [to],
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]=1;
ALTER TABLE #tempSingle
    DROP COLUMN [grpsz];
GO
;

SELECT * INTO #tempRemainingtemp                        --isolate records w/ 2 or more entries. They need to be reduced to single records
    FROM (
        SELECT  [id], [data],                           --get [id] and [data] of duplicate records
                COUNT([id]) as [grpsz]
        FROM #mergeTest
        GROUP BY [id], [data]) AS [selection]
    WHERE [grpsz]>=2;
ALTER TABLE #tempRemainingtemp
    DROP COLUMN [grpsz]
SELECT * FROM #tempRemainingtemp
SELECT * INTO #temp                                     --get all duplicate records into #temp
    FROM (
        SELECT [b].*
        FROM #tempRemainingtemp AS [a]
        JOIN #mergeTest AS [b]
        ON      [a].[id]=[b].[id]
            AND ([a].[data]=[b].[data] OR [a].[data] IS NULL AND [b].[data] IS NULL)) AS [selection];

DROP TABLE #tempRemainingtemp;
GO
SELECT * INTO #tempRemaining
    FROM #temp;
DROP TABLE #temp;
GO
;
SELECT * FROM #tempRemaining
BEGIN
SELECT t1.*, t2.[from] as [prevfrom] INTO #temp0        --filter in records where previous 'to' date matched current 'from' date when grouped by id and data
    FROM #tempRemaining AS t1
    JOIN #tempRemaining AS t2
    ON      t2.[to] = t1.[from]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)

SELECT t1.*, t2.[prevfrom] INTO #temp1                  --add records that did not have a previous 'to' date b/c they were the extreme records in their group
    FROM #tempRemaining AS t1
    LEFT JOIN #temp0 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp0;

SELECT t1.*, t2.[to] as [nextto] INTO #temp2            --filter in records where current 'to' date matched next 'from' date when grouped by id and data
    FROM #temp1 AS t1
    JOIN #temp1 AS t2
    ON      t2.[from] = t1.[to]
        AND t1.[id] = t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL);

SELECT t1.*, t2.[nextto] INTO #temp                     --add records that did not have a next 'from' date b/c they were the extreme records in their group
    FROM #temp1 AS t1
    LEFT JOIN #temp2 AS t2
    ON      t1.[id]=t2.[id]
        AND ([t1].[data]=[t2].[data] OR [t1].[data] IS NULL AND [t2].[data] IS NULL)
        AND t1.[from] = t2.[from];

DROP TABLE #temp2;
DROP TABLE #temp1;

DELETE FROM #temp                                       --delete redundant records
    WHERE   [prevfrom] IS NOT NULL
        AND [nextto] IS NOT NULL;

WITH cte AS (                                           --select records that got reduced to singles and insert them into the singles table
    SELECT [id], [data], [from], [to]
        FROM [#temp]
        WHERE   [prevfrom] IS NULL
            AND [nextto] IS NULL)
DELETE FROM cte
OUTPUT deleted.[id], deleted.[data], deleted.[from], deleted.[to] INTO #tempSingle

/* ALL DUPLICATE RECORDS ARE NOW REDUCED TO PAIRS*/

SELECT * FROM #temp;
ALTER TABLE #temp
    DROP COLUMN [nextto],[prevfrom]                     --remove helper columns
END

SELECT TOP 1 * INTO #temptemp                           --create temporary tables for storage
    FROM #temp
SELECT TOP 1 * INTO #tempResult
    FROM #temp
TRUNCATE TABLE #temptemp
TRUNCATE TABLE #tempResult

WHILE EXISTS(SELECT [id] from #temp)
BEGIN
    WITH cte AS (
            SELECT TOP 2 *                              --select pair
                FROM #temp
                ORDER BY [id],[data],[from])
        DELETE FROM cte                                 --delete from original table
        OUTPUT deleted.* INTO #temptemp;
    INSERT INTO #tempResult                             --insert merged record into result table
        SELECT t1.[id], t1.[data], t1.[from], t2.[to]
        FROM #temptemp AS t1
        JOIN #temptemp AS t2
        ON t1.[from]<t2.[from];
    TRUNCATE TABLE #temptemp;                           --empty temporary storage table
END;

TRUNCATE TABLE #mergeTest;                              --insert single records and merged records into original table
INSERT INTO #mergeTest
    SELECT * FROM #tempResult;
INSERT INTO #mergeTest
    SELECT * FROM #tempSingle;

SELECT * FROM #mergeTest
    ORDER BY [id],[from];

0

Just for the case where you have non-adjacent date ranges that, even though consecutive, should remain separate, I came up with this solution:

See it on SQL Fiddle
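
The query reads from a table named dat, which is defined in the SQL Fiddle rather than reproduced here. A minimal stand-in loaded with the question's sample values, with column names taken from the query and types assumed, might look like this:

-- Hypothetical stand-in for the SQL Fiddle schema; types are assumptions.
CREATE TABLE dat
(
    ID     int     NOT NULL,
    Data   char(1) NOT NULL,
    [From] date    NOT NULL,
    [To]   date    NOT NULL
);

INSERT INTO dat (ID, Data, [From], [To]) VALUES
    (1, 'a', '2015-01-01', '2015-01-05'),
    (1, 'a', '2015-01-06', '2015-01-10'),
    (1, 'b', '2015-01-11', '2015-01-15'),
    (1, 'a', '2015-01-16', '2015-01-20'),
    (2, 'c', '2015-01-01', '2015-01-05'),
    (2, 'c', '2015-01-06', '2015-01-10');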

WITH lag_info AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    lag([To], 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevTo,
    lag(Data, 1, NULL) OVER (PARTITION BY ID ORDER BY [From]) AS PrevData
  FROM dat
),
segmented AS (
  SELECT
    ID,
    Data,
    [From],
    [To],
    -- new interval if non-contiguous or data changed
    -- if it's null, it means that it's the first entry for the ID, which means it's a new interval
    CASE
      WHEN [PrevTo] IS NULL
        OR PrevData IS NULL
        OR DATEDIFF(DAY, [PrevTo], [From]) > 1
        OR Data <> PrevData
      THEN 1
      ELSE 0
    END AS is_new_interval
  FROM lag_info
),
segmented_marked AS (
  SELECT
    ID,
    [From],
    [To],
    Data,
    -- increment only when new data is detected, using a running sum
    sum(s.is_new_interval)
      OVER (PARTITION BY ID ORDER BY [From] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
                                AS interval_id
  FROM segmented s
)
SELECT
  ID,
  min([From]) AS [From],
  max([To]) AS [To],
  Data
FROM segmented_marked
GROUP BY ID, Data, interval_id

-1

I wrote a query that seems to work. It uses Common Table Expressions, the MERGE statement, and analytic functions. However, it is only compatible with SQL Server 2012+. You can find the gist here: MergeRecordsByValidityDate.sql

/*  NOTE: Only works w/ SQL Server 2012+
    Merging identical records with different validity dates.
*/
USE [master]


IF OBJECT_ID('mergeTest') IS NOT NULL
    DROP TABLE mergeTest

CREATE TABLE mergeTest          -- Create table with test data
(
    [id] int NOT NULL,
    [data] char(1) NOT NULL,
    [from] date NOT NULL,
    [to] date NOT NULL
);

INSERT INTO mergeTest ([id],[data],[from],[to]) VALUES      -- Insert records w/ different validity dates
    (1,'a','2015-01-01','2015-01-05'),  --1
    (1,'a','2015-01-05','2015-01-10'),  --2
    (1,'a','2015-01-10','2015-01-14'),  --3
    (1,'b','2015-01-14','2015-01-15'),  --4
    (1,'a','2015-01-15','2015-01-20'),  --5
    (1,'a','2015-01-20','2015-01-25'),  --6
    (1,'a','2015-01-25','2015-01-30'),  --7
    (1,'a','2015-01-30','2015-02-04'),  --8
    (2,'c','2015-01-01','2015-01-05'),  --9
    (2,'c','2015-01-05','2015-01-10')   --10

SELECT * FROM mergeTest

/*  This SELECT function uses a Common Table Expression along with Analytic functions over a partition.
    The data set is partitioned on similar primary key and data columns and ordered by 'from' dates.
    A 'last' and 'next' column is added with 'to' date of prev row and 'from' date of next row.
    For each partition, rows are selected (for each partition) that represent the first and last records 
    of identical data. For e.g. rows 5,6,7,8 are reduced to 5,8.
*/

;WITH partitionedData AS (
    SELECT *,   LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM mergeTest)
SELECT [id],[data],[from],[to],[last],[next] INTO #temp
    FROM partitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temp

/*  Now all redundant 'sandwiched' records have been filtered out, only the extreme records are left.
    This MERGE function matches rows on primary key and data, and If the 'to' date of said record matches
    'from' date of another similar record, then the said record is extended to encapsulate the other record's
    'to' date. For example row 5's 'to' date is extended to equal row 8's 'to' date.
*/

MERGE INTO #temp as m1
    USING #temp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[to]=m2.[from])
    THEN
    UPDATE SET  m1.[to]=m2.[to]
;

SELECT * FROM #temp

/*  The MERGE function has done its job of extending records. However there are still 2 records with
    identical data. For e.g. rows 9,10 exist even though row 9 now has all the required information. This 
    block modifies such redundant rows so their 'last' and 'from' columns become asynchronous.
*/

;WITH repartitionedData AS (
    SELECT [id],[data],[from],[to], LAG([to],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [last],
                LEAD([from],1,NULL) OVER(PARTITION BY [id],[data] ORDER BY [from]) AS [next]
    FROM #temp)
SELECT [id],[data],[from],[to],[last],[next] INTO #temptemp
    FROM repartitionedData
    WHERE [last] IS NULL OR [next] IS NULL OR [last]<>[from] OR [next]<>[to]
;

SELECT * FROM #temptemp

/* Asynchronous rows are deleted
*/

DELETE FROM #temptemp
    WHERE [from]<[last]

SELECT * FROM #temptemp

/*  However, blocks of data with >2 rows (like rows 5 through 8) could not be merged because of the filtered out
    rows (i.e. rows 6,7). Applying MERGE again on the updated data set.
*/

MERGE INTO #temptemp as m1
    USING #temptemp as m2
    ON m1.id=m2.id AND m1.data=m2.data
WHEN MATCHED
    AND (m1.[from]=m2.[next])
    THEN
    UPDATE SET  m1.[from]=m2.[from],
                m1.[last]=CASE WHEN ((m2.[last] IS NULL) OR (m2.[next] IS NULL)) THEN NULL ELSE m1.[last] END   --if row absorbing from is extreme, then current row is also extreme
;

SELECT * FROM #temptemp

TRUNCATE TABLE mergeTest        -- resetting original table

/* The MERGE corrected all rows with the correct 'from' and 'to' dates. And the only rows we are interested in are
    the extreme rows i.e. with 'last' or 'next' == NULL. SELECTing on that criterion and INSERTing into original table.
*/

INSERT INTO mergeTest           -- inserting processed records into table + some last minute filtering
    SELECT [id],[data],[from],MAX([to])
    FROM #temptemp
        WHERE [next] IS NULL OR [last] IS NULL
    GROUP BY [id],[data],[from]

SELECT * FROM mergeTest

DROP TABLE #temp
DROP TABLE #temptemp