RedShift GROUP BY Constant Column gives inconsistent results

I was wondering if anyone could explain why using GROUP BY on a constant column that is a VARCHAR, as opposed to one that is an INTEGER, gives different behavior.

Here is my minimal working example. The following table abstracts the real data on which I found this problem:

-- Builds the small reproduction table test.show_bug from the cross product
-- of a six-value integer set with itself (the article reports 36 rows).
CREATE TABLE test.show_bug AS

-- integers: the literal values 0 through 5, one row each.
WITH integers AS (
    SELECT 0 AS num
    UNION SELECT 1 AS num
    UNION SELECT 2 AS num
    UNION SELECT 3 AS num
    UNION SELECT 4 AS num
    UNION SELECT 5 AS num
)
SELECT 
    -- Base date literal offset by a.num modulo 2 (i.e. by 0 or 1).
    '2017-03-16' + mod(a.num, 2) AS date_time 
    -- One of three source labels, selected by b.num modulo 3.
    , CASE mod(b.num, 3)
        WHEN 0 THEN 'source_a'
        WHEN 1 THEN 'source_b'
        WHEN 2 THEN 'source_c'
        END AS user_source
    -- Identifier built by concatenating b.num and a.num alternately.
    , b.num || a.num || b.num || a.num || b.num AS user_id
FROM integers AS a
CROSS JOIN integers AS b
;

It looks like this:

 date_time  | user_source | user_id
------------+-------------+---------
 2017-03-17 | source_a    | 3113313
 2017-03-17 | source_b    | 4114414
 2017-03-17 | source_b    | 1111111
 2017-03-16 | source_a    | 0000000
 2017-03-16 | source_c    | 2442242
 2017-03-16 | source_c    | 5225525
....
(36 rows)

Essentially, I want to be able to COUNT (for every day) the number of users, the number of sources and the number of users per source. However, I have two different tables of the same format whose results I want to UNION together. I can differentiate these results by adding a constant column to them: app_1 and app_2.

To do this, I run the following SQL:

-- Per-day counts tagged with a string-literal constant column (app).
-- GROUP BY 1, 2 refers to the first two select-list items: app, date_time.
-- NOTE: kept exactly as written; this is the form the article reports
-- as returning inconsistent results.
SELECT 
    'app_1' AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

-- UNION (not UNION ALL): duplicate rows across the two halves are removed.
UNION

-- Same aggregation over the same table, tagged 'app_2'.
SELECT 
    'app_2' AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

   app    |     date_trunc      | count | count1 | count2
----------+---------------------+-------+--------+--------
 app_1    | 2017-03-16 00:00:00 |     2 |      1 |      0
 app_1    | 2017-03-17 00:00:00 |     2 |      1 |      0
 app_1    | 2017-03-19 00:00:00 |     5 |      0 |      1
 app_2    | 2017-03-19 00:00:00 |     7 |      1 |      0
 app_1    | 2017-03-16 00:00:00 |     0 |      1 |      0
....
(112 rows)

However, if I replace the VARCHAR constants app_1 and app_2 with the integers 1 and 2, i.e.:

-- Same per-day aggregation, but the constant grouping column (app) is an
-- integer literal rather than a string literal.
-- GROUP BY 1, 2 refers to the first two select-list items: app, date_time.
SELECT 
    1 AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

-- UNION (not UNION ALL): duplicate rows across the two halves are removed.
UNION

-- Same aggregation over the same table, tagged 2.
SELECT 
    2 AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

:

   app    |     date_trunc      | count | count1 | count2
----------+---------------------+-------+--------+--------
        1 | 2017-03-16 00:00:00 |   192 |     16 |    192
        1 | 2017-03-17 00:00:00 |   208 |     14 |    208
        1 | 2017-03-18 00:00:00 |   203 |     14 |    203
        1 | 2017-03-19 00:00:00 |   203 |     14 |    203
        1 | 2017-03-20 00:00:00 |    35 |      0 |     35
        2 | 2017-03-16 00:00:00 |   192 |     16 |    192
        2 | 2017-03-17 00:00:00 |   208 |     14 |    208
        2 | 2017-03-18 00:00:00 |   203 |     14 |    203
        2 | 2017-03-19 00:00:00 |   203 |     14 |    203
        2 | 2017-03-20 00:00:00 |    35 |      0 |     35

, UNION.

, , , , -, VARCHAR INTEGER.

- , , .

+4
2

, Amazon Redshift , , , .

:

  • a VARCHAR GROUP BY
  • COUNT(DISTINCT)

, :

-- Reduced case: a string-literal constant as the only grouping column,
-- combined with two COUNT(DISTINCT ...) aggregates.
-- GROUP BY 1 refers to the constant '1' in the select list.
SELECT 
    '1',
    COUNT(DISTINCT user_source),
    COUNT(DISTINCT user_id)
FROM show_bug
GROUP BY 1

:

-- Reduced case with the constant cast to INTEGER; otherwise identical to
-- the string-literal variant (same aggregates, same positional GROUP BY).
SELECT 
    '1'::INTEGER,
    COUNT(DISTINCT user_source),
    COUNT(DISTINCT user_id)
FROM show_bug
GROUP BY 1

COUNT(DISTINCT) .

AWS, . , AWS, .

+3

, . UNION ALL , .

With CHAR, the query returns the expected 4 rows:

-- Variant with the constant grouping column explicitly cast to CHAR(5).
-- GROUP BY 1, 2 refers to the first two select-list items: app, date_time.
SELECT
    'app_1'::CHAR(5) AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2
-- UNION (not UNION ALL): duplicate rows across the two halves are removed.
UNION
SELECT
    'app_2'::CHAR(5) AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

  app  | date_time  | count | count1 | count2
-------+------------+-------+--------+--------
 app_2 | 2017-03-16 |    18 |      3 |     18
 app_1 | 2017-03-17 |    18 |      3 |     18
 app_1 | 2017-03-16 |    18 |      3 |     18
 app_2 | 2017-03-17 |    18 |      3 |     18
(4 rows)

With VARCHAR, the results are inconsistent:

-- Variant with the constant grouping column explicitly cast to VARCHAR(10);
-- otherwise identical to the CHAR(5) variant.
-- GROUP BY 1, 2 refers to the first two select-list items: app, date_time.
SELECT
    'app_1'::VARCHAR(10) AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2
-- UNION (not UNION ALL): duplicate rows across the two halves are removed.
UNION
SELECT
    'app_2'::VARCHAR(10) AS app
    , date_time
    , COUNT(user_source)
    , COUNT(DISTINCT user_source)
    , COUNT(DISTINCT user_id)
FROM test.show_bug
GROUP BY 1, 2

  app  | date_time  | count | count1 | count2
-------+------------+-------+--------+--------
 app_1 | 2017-03-16 |     3 |      1 |      0
 app_1 | 2017-03-17 |     3 |      1 |      0
 app_2 | 2017-03-17 |     0 |      1 |      0
 app_2 | 2017-03-16 |     3 |      1 |      0
 app_2 | 2017-03-17 |     0 |      0 |      1
 app_1 | 2017-03-16 |     0 |      0 |      1
 app_2 | 2017-03-16 |     0 |      0 |      1
 app_1 | 2017-03-17 |     0 |      1 |      0
 app_2 | 2017-03-16 |     0 |      1 |      0
 app_1 | 2017-03-16 |     0 |      1 |      0
 app_1 | 2017-03-17 |     0 |      0 |      1
 app_2 | 2017-03-17 |     3 |      1 |      0
(12 rows)

INT , CHAR .

, . , , .

0

Source: https://habr.com/ru/post/1672769/


All Articles