@echo off
REM ---------------------------------------------------------------------------
REM GHArchive GZ to Parquet conversion wrapper.
REM
REM Usage:  script source target orgfile [logs] [pylogs] [workers]
REM
REM Validates the three required arguments, fills in defaults for the optional
REM ones, creates the target directory when it does not exist, runs
REM gharchive-gz-to-parquet.py and reports the elapsed wall-clock time.
REM
REM Exit code: the exit code of the Python process, or 1 when the help text is
REM shown or the target directory cannot be created.
REM ---------------------------------------------------------------------------

echo ***
echo *** BEGIN: GHArchive GZ to Parquet Conversion
echo *** Started at: %date% %time%
echo ***

REM The time variable pads single-digit hours with a space, for example
REM " 9:05:01.23". Replace the space with a zero so the fixed-offset
REM substrings used for the duration are always two digits.
set "start=%time: =0%"

REM Show help on explicit request or when a required argument is missing
if "%~1"=="/?" goto help
if "%~1"=="" goto help
if "%~2"=="" goto help
if "%~3"=="" goto help

REM Get command line arguments
set "source=%~1"
set "target=%~2"
set "orgfile=%~3"

REM Drop one trailing backslash from the directory arguments. A backslash
REM directly before the closing quote would be read by the Python launcher as
REM an escaped quote, and it would also leave the source name below empty.
REM Paths of three characters or fewer, such as a drive root, are left alone.
if not "%source:~3%"=="" if "%source:~-1%"=="\" set "source=%source:~0,-1%"
if not "%target:~3%"=="" if "%target:~-1%"=="\" set "target=%target:~0,-1%"

REM Last component of the source path, used in the default log file names.
REM This is computed on its own line rather than inside a parenthesised block:
REM percent-variables inside a block are expanded when the whole block is
REM parsed, so a value assigned inside the block is not visible to the other
REM lines of that same block.
for %%I in ("%source%") do set "sourcename=%%~nxI"

REM Set optional arguments with defaults
set "logs=%~4"
if not defined logs set "logs=%target%\gharchive-gz-hour2day-%sourcename%.csv"

set "pylogs=%~5"
if not defined pylogs set "pylogs=%target%\gharchive-gz-to-parquet-pylog-%sourcename%.log"

set "workers=%~6"
if not defined workers set "workers=55"

REM Run conversion
echo Running conversion with:
echo   Source:      %source%
echo   Target:      %target%
echo   Org File:    %orgfile%
echo   Logs:        %logs%
echo   Python Logs: %pylogs%
echo   Workers:     %workers%
echo.

REM Create the target directory up front; the default log paths live in it.
if exist "%target%" goto run
echo Creating target directory: %target%
mkdir "%target%"
if errorlevel 1 (
    echo ERROR: could not create the target directory 1>&2
    exit /b 1
)

:run
REM Python output goes to the console only.
REM NOTE(review): pylogs is still accepted and echoed above, but nothing is
REM written to it since the redirection was removed. Restore the redirection
REM on the python line if a log file is required - confirm the intent.
python gharchive-gz-to-parquet.py --source "%source%" --target "%target%" --org-file "%orgfile%" --log-results "%logs%" --verbose --workers %workers%
set "rc=%errorlevel%"

REM Calculate duration.
REM Each two-digit field is prefixed with 1 so that 08 and 09 are not parsed
REM as invalid octal numbers; the prefixes cancel out in each subtraction.
set "stop=%time: =0%"
set /a "secs=(1%stop:~6,2% - 1%start:~6,2%) + (1%stop:~3,2% - 1%start:~3,2%) * 60 + (1%stop:~0,2% - 1%start:~0,2%) * 3600"
REM A negative value means the run crossed midnight once. Runs longer than
REM 24 hours are not measured correctly by this clock-only calculation.
if %secs% lss 0 set /a "secs+=86400"

echo.
echo ***
echo *** END: Conversion completed
if not "%rc%"=="0" echo *** WARNING: python exited with code %rc%
echo *** Duration: %secs% seconds
echo *** Finished at: %date% %time%
echo ***
exit /b %rc%

:help
echo GHArchive GZ to Parquet Conversion Script
echo.
echo Usage:
echo   %~nx0 source target orgfile [logs] [pylogs] [workers]
echo.
echo Required arguments:
echo   source    Path to source GZ data files
echo   target    Path for output Parquet files
echo   orgfile   Path to organization list file
echo.
echo Optional arguments:
echo   logs      Path to save results log CSV
echo             (default: target\gharchive-gz-hour2day-SOURCENAME.csv)
echo   pylogs    Path to save Python output log
echo             (default: target\gharchive-gz-to-parquet-pylog-SOURCENAME.log)
echo   workers   Number of parallel workers (default: 55)
echo.
echo   SOURCENAME is the last component of the source path.
echo.
echo Example:
echo   %~nx0 "p:\gha-raw-daily\2024" "p:\gha-parquet-daily\2024" "C:\data\gharchive\orgs.csv"

REM exit /b 1 returns error code 1 to the calling process to indicate that
REM help was shown, either because a required argument was missing or because
REM help was requested explicitly.
exit /b 1
0 commit comments