fix: remove redundant code
andyhuang18 committed Dec 5, 2024
1 parent a046607 commit bc18fac
Showing 3 changed files with 0 additions and 294 deletions.
98 changes: 0 additions & 98 deletions dashboard/company/scripts/workflow_1.py
@@ -235,104 +235,6 @@ def fetch_and_upload_mergedPRcount_to_clickhouse(csv_file_path):

print("CSV 数据已成功上传到目标 ClickHouse 数据库")

# 获取最近6个月的日期

def get_last_six_months():
today = datetime.today()
return [(today - timedelta(days=30 * i)).strftime('%Y-%m') for i in range(6)][::-1]

last_six_months = get_last_six_months()

# API请求的基础URL模板
activity_url_template = "https://oss.x-lab.info/open_digger/github/{}/activity.json"
openrank_url_template = "https://oss.x-lab.info/open_digger/github/{}/openrank.json"

# 数据存储
data = []

# 从URL获取数据并过滤最近六个月的数据
def fetch_and_filter(url, months):
try:
response = requests.get(url)
response.raise_for_status()
json_data = response.json()
return {month: json_data.get(month, None) for month in months}
except requests.RequestException as e:
print(f"请求失败 {url}: {e}")
return {month: None for month in months}

# 遍历每个社区和仓库
for community, repos in data.items():
for repo_name in repos:
print(f"处理仓库: {repo_name}")

# 获取数据
activity_url = activity_url_template.format(repo_name)
openrank_url = openrank_url_template.format(repo_name)

activity_data = fetch_and_filter(activity_url, last_six_months)
openrank_data = fetch_and_filter(openrank_url, last_six_months)

# 将数据存储到目标格式
for month in last_six_months:
data.append({
"community": community,
"repo_name": repo_name,
"month": month,
"activity": activity_data.get(month),
"openrank": openrank_data.get(month)
})

# 转换为DataFrame
df = pd.DataFrame(data)

# 删除 activity 和 openrank 列中有空值的行
df_cleaned = df.dropna(subset=['activity', 'openrank'])

# 保存清洗后的结果到 CSV 文件
df_cleaned.to_csv(csv_file_path, index=False)
print(f"数据已保存到 {csv_file_path}")

# Step 2: 删除目标表并重新创建
try:
# 删除现有表
target_client.execute(
"DROP TABLE IF EXISTS community_repository_data_cleaned")
print("旧表已删除")
except Exception as e:
print(f"删除表时出错: {e}")

# 重新创建目标表
create_table_query = """
CREATE TABLE community_repository_data_cleaned
(
community String,
repo_name String,
month String,
activity String,
openrank String
) ENGINE = MergeTree()
ORDER BY (community, repo_name, month);
"""
target_client.execute(create_table_query)
print("目标表已重新创建")

# Step 3: 读取清洗后的 CSV 文件并将数据插入到目标数据库
with open(csv_file_path, 'r') as csvfile:
csv_reader = csv.DictReader(csvfile)

# 将清洗后的 CSV 数据插入到目标 ClickHouse 表
for row in csv_reader:
# 构造插入的 SQL 语句
insert_query = """
INSERT INTO community_repository_data_cleaned (community, repo_name, month, activity, openrank)
VALUES
"""
values = f"('{row['community']}', '{row['repo_name']}', '{row['month']}', '{row['activity']}', '{row['openrank']}')"
target_client.execute(insert_query + values)

print("清洗后的 CSV 数据已成功上传到目标 ClickHouse 数据库")


# 配置源 ClickHouse 的连接参数
source_client = Client(
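
The removed loop above sends one INSERT per CSV row and splices the values directly into the SQL string, which both round-trips per row and breaks on values containing quotes. Below is a minimal sketch of a batched, parameterized alternative with clickhouse-driver; the table name and column list come from the code above, while the connection parameters and CSV path are illustrative placeholders.

# A sketch, not the project's code: batched, parameterized insert with
# clickhouse-driver. Host and CSV path are assumptions for illustration.
import csv

from clickhouse_driver import Client

target_client = Client(host='localhost')  # connection params assumed

with open('community_repository_data_cleaned.csv', 'r') as csvfile:
    rows = [
        (row['community'], row['repo_name'], row['month'],
         row['activity'], row['openrank'])
        for row in csv.DictReader(csvfile)
    ]

# clickhouse-driver sends all rows as typed parameters in a single query,
# so no value is ever spliced into the SQL text.
target_client.execute(
    "INSERT INTO community_repository_data_cleaned "
    "(community, repo_name, month, activity, openrank) VALUES",
    rows,
)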
98 changes: 0 additions & 98 deletions dashboard/company/scripts/workflow_2.py
@@ -255,104 +255,6 @@ def fetch_and_upload_issueinfo_to_clickhouse(csv_file_path):

print("CSV 数据已成功上传到目标 ClickHouse 数据库")

# 获取最近6个月的日期

def get_last_six_months():
today = datetime.today()
return [(today - timedelta(days=30 * i)).strftime('%Y-%m') for i in range(6)][::-1]

last_six_months = get_last_six_months()

# API请求的基础URL模板
activity_url_template = "https://oss.x-lab.info/open_digger/github/{}/activity.json"
openrank_url_template = "https://oss.x-lab.info/open_digger/github/{}/openrank.json"

# 数据存储
data = []

# 从URL获取数据并过滤最近六个月的数据
def fetch_and_filter(url, months):
try:
response = requests.get(url)
response.raise_for_status()
json_data = response.json()
return {month: json_data.get(month, None) for month in months}
except requests.RequestException as e:
print(f"请求失败 {url}: {e}")
return {month: None for month in months}

# 遍历每个社区和仓库
for community, repos in data.items():
for repo_name in repos:
print(f"处理仓库: {repo_name}")

# 获取数据
activity_url = activity_url_template.format(repo_name)
openrank_url = openrank_url_template.format(repo_name)

activity_data = fetch_and_filter(activity_url, last_six_months)
openrank_data = fetch_and_filter(openrank_url, last_six_months)

# 将数据存储到目标格式
for month in last_six_months:
data.append({
"community": community,
"repo_name": repo_name,
"month": month,
"activity": activity_data.get(month),
"openrank": openrank_data.get(month)
})

# 转换为DataFrame
df = pd.DataFrame(data)

# 删除 activity 和 openrank 列中有空值的行
df_cleaned = df.dropna(subset=['activity', 'openrank'])

# 保存清洗后的结果到 CSV 文件
df_cleaned.to_csv(csv_file_path, index=False)
print(f"数据已保存到 {csv_file_path}")

# Step 2: 删除目标表并重新创建
try:
# 删除现有表
target_client.execute(
"DROP TABLE IF EXISTS community_repository_data_cleaned")
print("旧表已删除")
except Exception as e:
print(f"删除表时出错: {e}")

# 重新创建目标表
create_table_query = """
CREATE TABLE community_repository_data_cleaned
(
community String,
repo_name String,
month String,
activity String,
openrank String
) ENGINE = MergeTree()
ORDER BY (community, repo_name, month);
"""
target_client.execute(create_table_query)
print("目标表已重新创建")

# Step 3: 读取清洗后的 CSV 文件并将数据插入到目标数据库
with open(csv_file_path, 'r') as csvfile:
csv_reader = csv.DictReader(csvfile)

# 将清洗后的 CSV 数据插入到目标 ClickHouse 表
for row in csv_reader:
# 构造插入的 SQL 语句
insert_query = """
INSERT INTO community_repository_data_cleaned (community, repo_name, month, activity, openrank)
VALUES
"""
values = f"('{row['community']}', '{row['repo_name']}', '{row['month']}', '{row['activity']}', '{row['openrank']}')"
target_client.execute(insert_query + values)

print("清洗后的 CSV 数据已成功上传到目标 ClickHouse 数据库")


# 配置源 ClickHouse 的连接参数
source_client = Client(
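
The get_last_six_months() helper in these diffs approximates a month as 30 days, so near month boundaries the list can skip or repeat a calendar month (from March 1, stepping back 30 days lands on January 30, skipping February entirely). Below is a stdlib-only sketch that walks real calendar months instead; the function name is illustrative.

# A sketch of a calendar-accurate alternative to get_last_six_months().
from datetime import date

def last_six_calendar_months(today=None):
    """Most recent six calendar months as YYYY-MM strings, oldest first."""
    today = today or date.today()
    year, month = today.year, today.month
    months = []
    for _ in range(6):
        months.append(f"{year:04d}-{month:02d}")
        # Step back one real calendar month, rolling over the year boundary.
        month -= 1
        if month == 0:
            year, month = year - 1, 12
    return months[::-1]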
98 changes: 0 additions & 98 deletions dashboard/company/scripts/workflow_3.py
@@ -262,104 +262,6 @@ def fetch_and_upload_contributorinfo_to_clickhouse(csv_file_path):

print("CSV 数据已成功上传到目标 ClickHouse 数据库")

# 获取最近6个月的日期

def get_last_six_months():
today = datetime.today()
return [(today - timedelta(days=30 * i)).strftime('%Y-%m') for i in range(6)][::-1]

last_six_months = get_last_six_months()

# API请求的基础URL模板
activity_url_template = "https://oss.x-lab.info/open_digger/github/{}/activity.json"
openrank_url_template = "https://oss.x-lab.info/open_digger/github/{}/openrank.json"

# 数据存储
data = []

# 从URL获取数据并过滤最近六个月的数据
def fetch_and_filter(url, months):
try:
response = requests.get(url)
response.raise_for_status()
json_data = response.json()
return {month: json_data.get(month, None) for month in months}
except requests.RequestException as e:
print(f"请求失败 {url}: {e}")
return {month: None for month in months}

# 遍历每个社区和仓库
for community, repos in data.items():
for repo_name in repos:
print(f"处理仓库: {repo_name}")

# 获取数据
activity_url = activity_url_template.format(repo_name)
openrank_url = openrank_url_template.format(repo_name)

activity_data = fetch_and_filter(activity_url, last_six_months)
openrank_data = fetch_and_filter(openrank_url, last_six_months)

# 将数据存储到目标格式
for month in last_six_months:
data.append({
"community": community,
"repo_name": repo_name,
"month": month,
"activity": activity_data.get(month),
"openrank": openrank_data.get(month)
})

# 转换为DataFrame
df = pd.DataFrame(data)

# 删除 activity 和 openrank 列中有空值的行
df_cleaned = df.dropna(subset=['activity', 'openrank'])

# 保存清洗后的结果到 CSV 文件
df_cleaned.to_csv(csv_file_path, index=False)
print(f"数据已保存到 {csv_file_path}")

# Step 2: 删除目标表并重新创建
try:
# 删除现有表
target_client.execute(
"DROP TABLE IF EXISTS community_repository_data_cleaned")
print("旧表已删除")
except Exception as e:
print(f"删除表时出错: {e}")

# 重新创建目标表
create_table_query = """
CREATE TABLE community_repository_data_cleaned
(
community String,
repo_name String,
month String,
activity String,
openrank String
) ENGINE = MergeTree()
ORDER BY (community, repo_name, month);
"""
target_client.execute(create_table_query)
print("目标表已重新创建")

# Step 3: 读取清洗后的 CSV 文件并将数据插入到目标数据库
with open(csv_file_path, 'r') as csvfile:
csv_reader = csv.DictReader(csvfile)

# 将清洗后的 CSV 数据插入到目标 ClickHouse 表
for row in csv_reader:
# 构造插入的 SQL 语句
insert_query = """
INSERT INTO community_repository_data_cleaned (community, repo_name, month, activity, openrank)
VALUES
"""
values = f"('{row['community']}', '{row['repo_name']}', '{row['month']}', '{row['activity']}', '{row['openrank']}')"
target_client.execute(insert_query + values)

print("清洗后的 CSV 数据已成功上传到目标 ClickHouse 数据库")


# 配置源 ClickHouse 的连接参数
source_client = Client(
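
The deleted block is a byte-for-byte copy across workflow_1.py, workflow_2.py, and workflow_3.py, which is the redundancy this commit removes. Had the logic still been needed, one option would have been to keep a single copy in a shared helper that each workflow imports; the module path and names in this sketch are illustrative, not from the repository.

# Hypothetical shared module, e.g. dashboard/company/scripts/opendigger_utils.py
# (path and names are illustrative). Each workflow_*.py would then do:
#     from opendigger_utils import get_last_six_months, fetch_and_filter
from datetime import datetime, timedelta

import requests

ACTIVITY_URL_TEMPLATE = "https://oss.x-lab.info/open_digger/github/{}/activity.json"
OPENRANK_URL_TEMPLATE = "https://oss.x-lab.info/open_digger/github/{}/openrank.json"

def get_last_six_months():
    """Most recent six months as YYYY-MM strings, oldest first (30-day steps)."""
    today = datetime.today()
    return [(today - timedelta(days=30 * i)).strftime('%Y-%m') for i in range(6)][::-1]

def fetch_and_filter(url, months):
    """Fetch OpenDigger JSON from url and keep only the requested months."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        json_data = response.json()
        return {month: json_data.get(month) for month in months}
    except requests.RequestException as e:
        print(f"Request failed {url}: {e}")
        return {month: None for month in months}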
