-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from raphaelmansuy/feat/strip-comments
- Loading branch information
Showing
7 changed files
with
440 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
""" A collection of functions to strip comments from code strings based on the programming language. """ | ||
|
||
import re | ||
|
||
|
||
def strip_c_style_comments(code: str) -> str:
    """
    Strips C-style comments from the given code string.

    Removes single-line comments (//) and multi-line comments (/* */).
    Quoted literals are matched by the same pattern and returned untouched so
    that comment markers inside them are not mistaken for comments. Besides
    single- and double-quoted literals this also covers backtick-delimited
    literals (e.g. JavaScript template literals, Go raw strings), so text such
    as `http://host` survives intact.

    :param code: The code string to strip comments from.
    :return: The code string with C-style comments removed.
    """
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/'
        r'|\'(?:\\.|[^\\\'])*\''
        r'|"(?:\\.|[^\\"])*"'
        r'|`(?:\\.|[^\\`])*`',
        re.DOTALL | re.MULTILINE,
    )

    def _replace(match):
        token = match.group(0)
        # Literals start with their quote character; everything else the
        # pattern can match is a comment and is dropped.
        if token.startswith(("'", '"', "`")):
            return token
        return ""

    return pattern.sub(_replace, code)
|
||
|
||
def strip_html_style_comments(code: str) -> str:
    """
    Remove every ``<!-- ... -->`` comment from the given markup.

    Matching is non-greedy and spans newlines, so single-line and multi-line
    comments are both removed, each one ending at the nearest ``-->``.

    :param code: The code string to strip comments from.
    :return: The code string with HTML-style comments removed.
    """
    return re.sub(r"<!--.*?-->", "", code, flags=re.DOTALL)
|
||
|
||
def strip_python_style_comments(code: str) -> str:
    """
    Strips Python-style comments from the given code string.

    Removes single-line comments (#) and triple-quoted strings that stand at
    the start of a line (docstrings / block comments). Quoted literals are
    matched by the same pattern and returned untouched so that a ``#`` inside
    them is not treated as a comment. A triple-quoted string that follows
    other code on its line (for example the right-hand side of an assignment)
    is a value, not a comment, and is preserved.

    :param code: The code string to strip comments from.
    :return: The code string with Python-style comments removed.
    """
    pattern = re.compile(
        r'(?s)#.*?$|\'\'\'.*?\'\'\'|""".*?"""|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.MULTILINE,
    )

    def _replace(match):
        token = match.group(0)
        if token.startswith("#"):
            return ""
        if token.startswith(("'''", '"""')):
            # Only drop the string when nothing but whitespace precedes it on
            # its line; otherwise it is part of an expression.
            line_start = code.rfind("\n", 0, match.start()) + 1
            if not code[line_start:match.start()].strip():
                return ""
        return token

    return pattern.sub(_replace, code)
|
||
|
||
def _shell_comment_index(line: str) -> int:
    """Return the index of the ``#`` that starts a comment in *line*, or -1."""
    quote = ""
    escaped = False
    for index, char in enumerate(line):
        if escaped:
            # The previous backslash neutralises this character.
            escaped = False
        elif quote == "'":
            # Inside single quotes only the closing quote is special.
            if char == "'":
                quote = ""
        elif char == "\\":
            escaped = True
        elif quote == '"':
            if char == '"':
                quote = ""
        elif char in "'\"":
            quote = char
        elif char == "#" and (index == 0 or line[index - 1] in " \t;&|()"):
            # A comment only begins where a new word could begin, which
            # excludes forms such as $#, ${#var} and ${var#prefix}.
            return index
    return -1


def strip_shell_style_comments(code: str) -> str:
    """
    Strips shell-style comments from the given code string.

    Supports single-line comments (#) and multi-line comments (: ' ').
    Lines starting with ``#!`` are preserved. A ``#`` is treated as a comment
    marker only when it is outside quotes and at the start of a word; lines
    that consist solely of a comment are dropped. A ``: '...'`` comment that
    opens and closes on the same line does not start a multi-line comment.
    Quote state is tracked per line only.

    :param code: The code string to strip comments from.
    :return: The code string with shell-style comments removed, with leading
        and trailing whitespace of the whole result stripped.
    """
    new_lines = []
    in_multiline_comment = False

    for line in code.split("\n"):
        stripped = line.strip()
        if stripped.startswith("#!"):
            # Preserve shebang lines
            new_lines.append(line)
        elif in_multiline_comment:
            if stripped.endswith("'"):
                in_multiline_comment = False
        elif stripped.startswith(": '"):
            # `: 'text'` closed on the same line must not swallow what follows;
            # the bare opener `: '` (3 chars) ends with its own opening quote.
            in_multiline_comment = not (len(stripped) > 3 and stripped.endswith("'"))
        else:
            cut = _shell_comment_index(line)
            if cut == -1:
                new_lines.append(line)
            elif line[:cut].strip():
                # Keep the code that precedes the trailing comment.
                new_lines.append(line[:cut])

    return "\n".join(new_lines).strip()
|
||
def strip_sql_style_comments(code: str) -> str:
    """
    Remove SQL comments from the given code string.

    Line comments (--) and block comments (/* */) are deleted. Single- and
    double-quoted literals are matched by the same pattern and handed back
    unchanged, so comment markers inside them are left alone.

    :param code: The code string to strip comments from.
    :return: The code string with SQL-style comments removed.
    """
    comment_or_literal = re.compile(
        r'--.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def _keep_literals(found):
        text = found.group(0)
        if text.startswith(("'", '"')):
            return text
        return ""

    return comment_or_literal.sub(_keep_literals, code)
|
||
|
||
def strip_matlab_style_comments(code: str) -> str:
    """
    Strips MATLAB-style comments from the given code string.

    Removes single-line comments (%). Quoted literals are matched by the same
    pattern and returned untouched so that a ``%`` inside them (e.g. a format
    specifier) is not treated as a comment.

    A single quote that directly follows an identifier, a number, a closing
    bracket, a dot or another quote is the transpose operator rather than the
    start of a literal, so it is not allowed to open one. Single-quoted
    literals stay on one line and escape a quote by doubling it ('').

    :param code: The code string to strip comments from.
    :return: The code string with MATLAB-style comments removed.
    """
    pattern = re.compile(
        r"%.*?$"
        # The lookbehind rejects a transpose quote as the start of a literal.
        r"|(?<![\w)\]}.'])'(?:''|[^'\n])*'"
        r'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )

    def _replace(match):
        token = match.group(0)
        # Literals are kept verbatim; the only other thing matched is a comment.
        if token.startswith(("'", '"')):
            return token
        return ""

    return pattern.sub(_replace, code)
|
||
|
||
def strip_r_style_comments(code: str) -> str:
    """
    Remove R comments from the given code string.

    Everything from a ``#`` to the end of its line is deleted. Single- and
    double-quoted literals are matched by the same pattern and handed back
    unchanged, so a ``#`` inside them is left alone.

    :param code: The code string to strip comments from.
    :return: The code string with R-style comments removed.
    """
    comment_or_literal = re.compile(
        r'#.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE
    )

    def _keep_literals(found):
        text = found.group(0)
        if text.startswith(("'", '"')):
            return text
        return ""

    return comment_or_literal.sub(_keep_literals, code)
|
||
|
||
def strip_comments(code: str, language: str) -> str:
    """
    Dispatch to the comment stripper that matches the given language.

    The language name is compared as-is against each family's list of names;
    when no family lists it, the code is returned unchanged.

    :param code: The code string to strip comments from.
    :param language: The programming language of the code.
    :return: The code string with comments removed.
    """
    c_family = [
        "c",
        "cpp",
        "java",
        "javascript",
        "csharp",
        "php",
        "go",
        "rust",
        "kotlin",
        "swift",
        "scala",
        "dart",
    ]
    # Checked in order; the first family that lists the language wins.
    strippers = (
        (c_family, strip_c_style_comments),
        (["python", "ruby", "perl"], strip_python_style_comments),
        (["bash", "powershell", "shell"], strip_shell_style_comments),
        (["html", "xml"], strip_html_style_comments),
        (["sql", "plsql", "tsql"], strip_sql_style_comments),
        (["matlab", "octave"], strip_matlab_style_comments),
        (["r"], strip_r_style_comments),
    )
    for names, stripper in strippers:
        if language in names:
            return stripper(code)
    return code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
""" This module contains the function to infer the programming language based on the file extension. """ | ||
|
||
import os | ||
|
||
|
||
def infer_language(filename: str) -> str:
    """
    Map a file name to a programming language via its extension.

    The extension is taken with ``os.path.splitext`` and lower-cased before
    lookup, so matching is case-insensitive.

    :param filename: The name of the file.
    :return: The inferred programming language, or "unknown" when the
        extension is not recognised.
    """
    languages_by_extension = {
        ".c": "c",
        ".h": "c",
        ".cpp": "cpp",
        ".hpp": "cpp",
        ".cc": "cpp",
        ".cxx": "cpp",
        ".java": "java",
        ".js": "javascript",
        ".jsx": "javascript",
        ".cs": "csharp",
        ".php": "php",
        ".go": "go",
        ".rs": "rust",
        ".kt": "kotlin",
        ".swift": "swift",
        ".scala": "scala",
        ".dart": "dart",
        ".py": "python",
        ".rb": "ruby",
        ".pl": "perl",
        ".pm": "perl",
        ".sh": "bash",
        ".ps1": "powershell",
        ".html": "html",
        ".htm": "html",
        ".xml": "xml",
        ".sql": "sql",
        ".m": "matlab",
        ".r": "r",
    }
    suffix = os.path.splitext(filename)[1].lower()
    return languages_by_extension.get(suffix, "unknown")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "code2prompt" | ||
version = "0.1.3" | ||
version = "0.2.0" | ||
description = "" | ||
authors = ["Raphael MANSUY <[email protected]>"] | ||
readme = "README.md" | ||
|
Oops, something went wrong.