Performing a Horizontal UNION ALL in SQL

48 Views Asked by At

I have this table (colors) in SQL:

CREATE TABLE colors (
    color1 VARCHAR(50),
    color2 VARCHAR(50),
    year INT,
    var1 INT,
    var2 INT
);


INSERT INTO colors (color1, color2, year, var1, var2) VALUES
('red', 'blue', 2010, 1, 2),
('blue', 'red', 2010, 0, 2),
('green', NULL, 2010, 3, 1),
('yellow', NULL, 2010, 2, 1),
('purple', 'black', 2010, 1, 1),
('red', NULL, 2011, 5, 5),
('green', 'blue', 2011, 3, 3),
('blue', 'green', 2011, 2, 3)
   ('white', 'orange', 2011, 2, 3);

color1 color2 year var1 var2
    red   blue 2010    1    2
   blue    red 2010    0    2
  green   <NA> 2010    3    1
 yellow   <NA> 2010    2    1
 purple  black 2010    1    1
    red   <NA> 2011    5    5
  green   blue 2011    3    3
   blue  green 2011    2    3
  white orange 2011    2    3

I am trying to accomplish the following task:

  • I want to create 4 new columns: color1_var1, color1_var2, color2_var1, color2_var2
  • If a pair of colors is found in the same year (e.g. red,blue, 2010 and blue, red, 2010), I want to update the values of color1_var1, color1_var2, color2_var1, color2_var2 with the corresponding information
  • If a pair of colors is not found in the same year (e.g green, null, 2010 or white, orange, 2011), then color2_var1 and color2_var2 will be left as NULL
  • I then want to only keep one unique row for each color combination in each year.

Here is what I tried so far:

First I used a self-join to create the new columns:

SELECT 
    a.color1 AS color1,
    a.color2 AS color2,
    a.year AS year,
    a.var1 AS color1_var1,
    a.var2 AS color1_var2,
    b.var1 AS color2_var1,
    b.var2 AS color2_var2
FROM 
    colors a
LEFT JOIN 
    colors b 
ON 
    a.year = b.year AND 
    ((a.color1 = b.color2 AND a.color2 = b.color1) OR 
     (a.color2 IS NULL AND b.color2 IS NULL AND a.color1 != b.color1));

 color1 color2 year color1_var1 color1_var2 color2_var1 color2_var2
    red   blue 2010           1           2           0           2
   blue    red 2010           0           2           1           2
  green   <NA> 2010           3           1           2           1
 yellow   <NA> 2010           2           1           3           1
 purple  black 2010           1           1          NA          NA
    red   <NA> 2011           5           5          NA          NA
  green   blue 2011           3           3           2           3
   blue  green 2011           2           3           3           3
  white orange 2011           2           3          NA          NA

But I am confused as to how I can keep only one occurrence of each duplicates (e.g. red/blue/2010 and blue/red/2010) from these results

I thought of a long way to do this:

WITH color_pairs AS (
    SELECT 
        a.color1 AS color1,
        a.color2 AS color2,
        a.year AS year,
        a.var1 AS color1_var1,
        a.var2 AS color1_var2,
        b.var1 AS color2_var1,
        b.var2 AS color2_var2
    FROM 
        colors a
    LEFT JOIN 
        colors b 
    ON 
        a.year = b.year AND 
        ((a.color1 = b.color2 AND a.color2 = b.color1) OR 
         (a.color2 IS NULL AND b.color2 IS NULL AND a.color1 != b.color1))
), 

ranked_colors AS (
    SELECT 
        *,
        ROW_NUMBER() OVER (
            PARTITION BY 
                CASE WHEN color1 < color2 THEN color1 ELSE color2 END, 
                CASE WHEN color1 < color2 THEN color2 ELSE color1 END, 
                year 
            ORDER BY year
        ) AS rn
    FROM 
        color_pairs
)

SELECT 
    *
FROM 
    ranked_colors
WHERE 
    rn = 1 OR color2 IS NULL;

I think this worked:

 color1 color2 year color1_var1 color1_var2 color2_var1 color2_var2 rn
  green   <NA> 2010           3           1           2           1  1
    red   <NA> 2011           5           5          NA          NA  1
 yellow   <NA> 2010           2           1           3           1  1
 purple  black 2010           1           1          NA          NA  1
  green   blue 2011           3           3           2           3  1
    red   blue 2010           1           2           0           2  1
  white orange 2011           2           3          NA          NA  1

Is the correct way to do it?

1

There are 1 best solutions below

1
Luty On

Update the new columns:

UPDATE c1
SET c1.color1_var1 = c2.var1,
    c1.color1_var2 = c2.var2,
    c1.color2_var1 = c3.var1,
    c1.color2_var2 = c3.var2
FROM colors c1
LEFT JOIN colors c2 ON c1.color1 = c2.color1 AND c1.year = c2.year
LEFT JOIN colors c3 ON c1.color2 = c3.color1 AND c1.year = c3.year

Then create a new table:

SELECT DISTINCT 
  color1, color2, year, var1, var2, 
  color1_var1, color1_var2, color2_var1, color2_var2
INTO colors_temp
FROM colors

The new table colors_temp contains what you looking for. You can drop the colors table and then rename the colors_temp to colors:

DROP TABLE colors;

EXEC sp_rename 'colors_temp', 'colors';