Skip to content

Commit 1d8aac5

Browse files
committed
feat sgm-gharchive - Add gharchive .gz --> .parquet
Also add Windows support, as this runs well on a Threadripper. Signed-off-by: Matt Young <[email protected]>
1 parent af50da3 commit 1d8aac5

5 files changed

+337
-35
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
@echo off
2+
3+
echo ***
4+
echo *** BEGIN
5+
echo ***
6+
7+
set "start=%time%"
8+
9+
REM Get command line arguments
10+
set "source=%~1"
11+
set "target=%~2"
12+
set "orgfile=%~3"
13+
set "logs=%~4"
14+
set "pylogs=%~5"
15+
set "workers=%~6"
16+
17+
if "%logs%"=="" (
18+
set "logs=%target%\gharchive-gz-hour2day-%source:~0,8%.csv"
19+
)
20+
21+
if "%pylogs%"=="" (
22+
set "pylogs=%target%\gharchive-gz-hour2day-pylog-%source:~0,8%.log"
23+
)
24+
25+
if "%workers%"=="" (
26+
set workers=55
27+
)
28+
29+
python gharchive-gz-hour2day.py --source %source% --target %target% --org-file %orgfile% --log-results %logs% --verbose --workers %workers% > %pylogs% 2>&1
30+
31+
set "stop=%time%"
32+
set /a "secs=1%stop:~6,2% - 1%start:~6,2% + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~0,2% - 1%start:~0,2%) / 60"
33+
echo Duration: %secs% seconds
34+
35+
@echo off
36+
echo ***
37+
echo *** END
38+
echo ***
39+
40+
REM Help text
41+
if "%~1"=="/?" goto help
42+
exit /b
43+
44+
:help
45+
echo Usage:
46+
echo %0 source target orgfile [logs] [pylogs] [workers]
47+
echo Required:
48+
echo source: Path to source data
49+
echo target: Path for output data
50+
echo orgfile: Path to org list file
51+
echo Optional:
52+
echo logs: Path to save results log CSV (default is gharchive-gz-hour2day-source-target.csv)
53+
echo pylogs: Path to save python output log (default is gharchive-gz-hour2day-pylog-source-target.log)
54+
echo workers: Number of workers (default is 55)
+69-30
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,93 @@
11
@echo off
22

33
echo ***
4-
echo *** BEGIN
4+
echo *** BEGIN: GHArchive GZ to Parquet Conversion
5+
echo *** Started at: %date% %time%
56
echo ***
67

78
set "start=%time%"
89

9-
REM Get command line arguments
10+
REM Validate required arguments
11+
if "%~1"=="" goto help
12+
if "%~2"=="" goto help
13+
if "%~3"=="" goto help
14+
15+
REM Get command line arguments
1016
set "source=%~1"
1117
set "target=%~2"
1218
set "orgfile=%~3"
13-
set "logs=%~4"
14-
set "pylogs=%~5"
15-
set "workers=%~6"
1619

17-
if "%logs%"=="" (
18-
set "logs=%target%\gharchive-gz-hour2day-%source:~0,8%.csv"
20+
REM Set optional arguments with defaults
21+
if "%~4"=="" (
22+
for %%I in ("%source%") do set "sourcename=%%~nxI"
23+
set "logs=%target%\gharchive-gz-hour2day-%sourcename%.csv"
24+
) else (
25+
set "logs=%~4"
1926
)
2027

21-
if "%pylogs%"=="" (
22-
set "pylogs=%target%\gharchive-gz-hour2day-pylog-%source:~0,8%.log"
28+
if "%~5"=="" (
29+
for %%I in ("%source%") do set "sourcename=%%~nxI"
30+
set "pylogs=%target%\gharchive-gz-to-parquet-pylog-%sourcename%.log"
31+
) else (
32+
set "pylogs=%~5"
2333
)
2434

25-
if "%workers%"=="" (
26-
set workers=55
35+
if "%~6"=="" (
36+
set workers=55
37+
) else (
38+
set "workers=%~6"
2739
)
2840

29-
python gharchive-gz-hour2day.py --source %source% --target %target% --org-file %orgfile% --log-results %logs% --verbose --workers %workers% > %pylogs% 2>&1
41+
REM Run conversion
42+
echo Running conversion with:
43+
echo Source: %source%
44+
echo Target: %target%
45+
echo Org File: %orgfile%
46+
echo Logs: %logs%
47+
echo Python Logs: %pylogs%
48+
echo Workers: %workers%
49+
echo.
3050

31-
set "stop=%time%"
32-
set /a "secs=1%stop:~6,2% - 1%start:~6,2% + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~0,2% - 1%start:~0,2%) / 60"
33-
echo Duration: %secs% seconds
51+
if not exist "%target%" (
52+
echo Creating target directory: %target%
53+
mkdir "%target%"
54+
)
3455

35-
@echo off
36-
echo ***
37-
echo *** END
38-
echo ***
56+
REM Output is intentionally not redirected, so Python output goes to the console (pylogs is therefore not written); results are still logged via --log-results
57+
python gharchive-gz-to-parquet.py --source %source% --target %target% --org-file %orgfile% --log-results %logs% --verbose --workers %workers%
3958

40-
REM Help text
41-
if "%~1"=="/?" goto help
59+
REM Calculate duration
60+
set "stop=%time%"
61+
set /a "secs=1%stop:~6,2% - 1%start:~6,2% + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~0,2% - 1%start:~0,2%) * 60"
62+
echo.
63+
echo ***
64+
echo *** END: Conversion completed
65+
echo *** Duration: %secs% seconds
66+
echo *** Finished at: %date% %time%
67+
echo ***
4268
exit /b
4369

4470
:help
71+
echo GHArchive GZ to Parquet Conversion Script
72+
echo.
4573
echo Usage:
46-
echo %0 source target orgfile [logs] [pylogs] [workers]
47-
echo Required:
48-
echo source: Path to source data
49-
echo target: Path for output data
50-
echo orgfile: Path to org list file
51-
echo Optional:
52-
echo logs: Path to save results log CSV (default is gharchive-gz-hour2day-source-target.csv)
53-
echo pylogs: Path to save python output log (default is gharchive-gz-hour2day-pylog-source-target.log)
54-
echo workers: Number of workers (default is 55)
74+
echo %~nx0 source target orgfile [logs] [pylogs] [workers]
75+
echo.
76+
echo Required arguments:
77+
echo source Path to source GZ data files
78+
echo target Path for output Parquet files
79+
echo orgfile Path to organization list file
80+
echo.
81+
echo Optional arguments:
82+
echo logs Path to save results log CSV
83+
echo (default: target\gharchive-gz-hour2day-YYYYMMDD.csv)
84+
echo pylogs Path to save Python output log
85+
echo (default: target\gharchive-gz-hour2day-pylog-YYYYMMDD.log)
86+
echo workers Number of parallel workers (default: 55)
87+
echo.
88+
echo Example:
89+
echo %~nx0 "p:\gha-raw-daily\2024" "p:\gha-parquet-daily\2024" "C:\data\gharchive\orgs.csv"
90+
91+
REM exit /b 1 returns error code 1 to calling process to indicate help was shown
92+
REM due to missing required arguments or explicit help request
93+
exit /b 1

db/sgm-gharchive/gharchive-gz-to-parquet.py

-5
Original file line numberDiff line numberDiff line change
@@ -1366,12 +1366,10 @@ def parse_args():
13661366
def main():
13671367
global g_pool, g_manager
13681368

1369-
ic()
13701369
print(f"Current Working Directory: {os.getcwd()}")
13711370
print(f"Script Path: {os.path.abspath(__file__)}")
13721371

13731372
# Set working directory to the location the script was called from
1374-
ic((f'cwd: {os.getcwd()}'))
13751373
os.chdir(os.path.dirname(os.path.abspath(__file__)))
13761374
ic((f'cwd: {os.getcwd()}'))
13771375

@@ -1410,9 +1408,6 @@ def main():
14101408
print('Starting PoolManager')
14111409
ic(context)
14121410
try:
1413-
#
1414-
# "DoAllTheWork()"
1415-
#
14161411
ret = process_days(context, days)
14171412

14181413
except KeyboardInterrupt:

0 commit comments

Comments
 (0)