import abc
import logging
-from datetime import datetime
-from pathlib import Path
-from typing import List
-
-from pyspark.errors import AnalysisException
-from pyspark.sql import DataFrame
-from pyspark.sql.types import StructType, StructField, TimestampType
-
-import listenbrainz_spark
-from listenbrainz_spark import hdfs_connection
-from listenbrainz_spark.config import HDFS_CLUSTER_URI
-from listenbrainz_spark.path import INCREMENTAL_DUMPS_SAVE_PATH, \
-    LISTENBRAINZ_LISTENER_STATS_AGG_DIRECTORY, LISTENBRAINZ_LISTENER_STATS_BOOKKEEPING_DIRECTORY
-from listenbrainz_spark.stats import run_query
-from listenbrainz_spark.utils import read_files_from_HDFS, get_listens_from_dump
+from datetime import date
+from typing import Optional
+
+from listenbrainz_spark.path import LISTENBRAINZ_LISTENER_STATS_DIRECTORY
+from listenbrainz_spark.stats.incremental.user.entity import UserEntity

logger = logging.getLogger(__name__)

-BOOKKEEPING_SCHEMA = StructType([
-    StructField('from_date', TimestampType(), nullable=False),
-    StructField('to_date', TimestampType(), nullable=False),
-    StructField('created', TimestampType(), nullable=False),
-])
-

-class EntityListener(abc.ABC):
-
-    def __init__(self, entity):
-        self.entity = entity
-
-    def get_existing_aggregate_path(self, stats_range) -> str:
-        return f"{LISTENBRAINZ_LISTENER_STATS_AGG_DIRECTORY}/{self.entity}/{stats_range}"

-    def get_bookkeeping_path(self, stats_range) -> str:
-        return f"{LISTENBRAINZ_LISTENER_STATS_BOOKKEEPING_DIRECTORY}/{self.entity}/{stats_range}"
+class EntityListener(UserEntity, abc.ABC):

-    def get_partial_aggregate_schema(self) -> StructType:
-        raise NotImplementedError()
+    def __init__(self, entity: str, stats_range: str, database: Optional[str], message_type: Optional[str]):
+        if not database:
+            database = f"{entity}_listeners_{stats_range}_{date.today().strftime('%Y%m%d')}"
+        super().__init__(entity, stats_range, database, message_type)

-    def aggregate(self, table, cache_tables) -> DataFrame:
-        raise NotImplementedError()
+    def get_table_prefix(self) -> str:
+        return f"{self.entity}_listener_{self.stats_range}"

-    def filter_existing_aggregate(self, existing_aggregate, incremental_aggregate):
-        raise NotImplementedError()
+    def get_base_path(self) -> str:
+        return LISTENBRAINZ_LISTENER_STATS_DIRECTORY

-    def combine_aggregates(self, existing_aggregate, incremental_aggregate) -> DataFrame:
+    def get_entity_id(self):
        raise NotImplementedError()

-    def get_top_n(self, final_aggregate, N) -> DataFrame:
-        raise NotImplementedError()
-
-    def get_cache_tables(self) -> List[str]:
-        raise NotImplementedError()
-
-    def generate_stats(self, stats_range: str, from_date: datetime,
-                       to_date: datetime, top_entity_limit: int):
-        cache_tables = []
-        for idx, df_path in enumerate(self.get_cache_tables()):
-            df_name = f"entity_data_cache_{idx}"
-            cache_tables.append(df_name)
-            read_files_from_HDFS(df_path).createOrReplaceTempView(df_name)
-
-        metadata_path = self.get_bookkeeping_path(stats_range)
-        try:
-            metadata = listenbrainz_spark \
-                .session \
-                .read \
-                .schema(BOOKKEEPING_SCHEMA) \
-                .json(f"{HDFS_CLUSTER_URI}{metadata_path}") \
-                .collect()[0]
-            existing_from_date, existing_to_date = metadata["from_date"], metadata["to_date"]
-            existing_aggregate_usable = existing_from_date.date() == from_date.date()
-        except AnalysisException:
-            existing_aggregate_usable = False
-            logger.info("Existing partial aggregate not found!")
-
-        prefix = f"entity_listener_{self.entity}_{stats_range}"
-        existing_aggregate_path = self.get_existing_aggregate_path(stats_range)
-
-        only_inc_entities = True
-
-        if not hdfs_connection.client.status(existing_aggregate_path, strict=False) or not existing_aggregate_usable:
-            table = f"{prefix}_full_listens"
-            get_listens_from_dump(from_date, to_date, include_incremental=False).createOrReplaceTempView(table)
-
-            logger.info("Creating partial aggregate from full dump listens")
-            hdfs_connection.client.makedirs(Path(existing_aggregate_path).parent)
-            full_df = self.aggregate(table, cache_tables)
-            full_df.write.mode("overwrite").parquet(existing_aggregate_path)
-
-            hdfs_connection.client.makedirs(Path(metadata_path).parent)
-            metadata_df = listenbrainz_spark.session.createDataFrame(
-                [(from_date, to_date, datetime.now())],
-                schema=BOOKKEEPING_SCHEMA
-            )
-            metadata_df.write.mode("overwrite").json(metadata_path)
-            only_inc_entities = False
-
-        full_df = read_files_from_HDFS(existing_aggregate_path)
-
-        if hdfs_connection.client.status(INCREMENTAL_DUMPS_SAVE_PATH, strict=False):
-            table = f"{prefix}_incremental_listens"
-            read_files_from_HDFS(INCREMENTAL_DUMPS_SAVE_PATH) \
-                .createOrReplaceTempView(table)
-            inc_df = self.aggregate(table, cache_tables)
-        else:
-            inc_df = listenbrainz_spark.session.createDataFrame([], schema=self.get_partial_aggregate_schema())
-            only_inc_entities = False
-
-        full_table = f"{prefix}_existing_aggregate"
-        full_df.createOrReplaceTempView(full_table)
-
-        inc_table = f"{prefix}_incremental_aggregate"
-        inc_df.createOrReplaceTempView(inc_table)
-
-        if only_inc_entities:
-            existing_table = f"{prefix}_filtered_aggregate"
-            filtered_aggregate_df = self.filter_existing_aggregate(full_table, inc_table)
-            filtered_aggregate_df.createOrReplaceTempView(existing_table)
-        else:
-            existing_table = full_table
-
-        combined_df = self.combine_aggregates(existing_table, inc_table)
-
-        combined_table = f"{prefix}_combined_aggregate"
-        combined_df.createOrReplaceTempView(combined_table)
-        results_df = self.get_top_n(combined_table, top_entity_limit)
+    def items_per_message(self):
+        return 10000

-        return only_inc_entities, results_df.toLocalIterator()
-
+    def parse_one_user_stats(self, entry: dict):
+        return entry
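
For illustration only, a minimal sketch of what a concrete subclass of the new EntityListener could look like. The ArtistEntityListener name, the "artist" entity, the "entity_listener" message type, the artist_mbid column, and the import path are all assumptions, not part of this change.

from typing import Optional

# assumed module path for the class changed in this diff
from listenbrainz_spark.stats.incremental.listener.entity import EntityListener


class ArtistEntityListener(EntityListener):  # hypothetical subclass, for illustration

    def __init__(self, stats_range: str, database: Optional[str] = None):
        # "artist" and "entity_listener" are assumed values, not taken from this diff
        super().__init__("artist", stats_range, database, "entity_listener")

    def get_entity_id(self):
        # assumed column identifying the entity whose listeners are being counted
        return "artist_mbid"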