Skip to content

Commit

Permalink
Merge pull request #1 from raphaelmansuy/feat/strip-comments
Browse files Browse the repository at this point in the history
  • Loading branch information
raphaelmansuy committed Mar 24, 2024
2 parents b0eb888 + 1206b3c commit 6368077
Show file tree
Hide file tree
Showing 7 changed files with 440 additions and 2 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ With Code2Prompt, you can easily create a well-structured and informative docume
- Respects .gitignore files to exclude unwanted files and directories
- Generates a table of contents with links to each file section
- Provides file metadata such as extension, size, creation time, and modification time
- Optionally strips comments from code files to focus on the core code
- Includes the actual code content of each file in fenced code blocks
- Handles binary files and files with encoding issues gracefully

Expand Down Expand Up @@ -102,6 +103,7 @@ code2prompt --path /path/to/your/codebase --output output.md
- `--output` (optional): Name of the output Markdown file. If not provided, the output will be displayed in the console.
- `--gitignore` (optional): Path to a custom .gitignore file. If not provided, the tool will look for a .gitignore file in the specified directory.
- `--filter` (optional): Filter pattern to include specific files (e.g., "*.py" to include only Python files).
- `--suppress-comments` (optional): Strip comments from the code files. If not provided, comments will be included.

### Examples

Expand All @@ -120,6 +122,11 @@ code2prompt --path /path/to/your/codebase --output output.md
code2prompt --path /path/to/your/project --output project.md --gitignore /path/to/custom/.gitignore
```

4. Generate a Markdown file with comments stripped from code files:
```
code2prompt --path /path/to/your/project --output project.md --suppress-comments
```

## Build

To build a distributable package of Code2Prompt using Poetry, follow these steps:
Expand Down
180 changes: 180 additions & 0 deletions code2prompt/comment_stripper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
""" A collection of functions to strip comments from code strings based on the programming language. """

import re


def strip_c_style_comments(code: str) -> str:
    """
    Remove C-style comments from a piece of source code.

    Line comments (//) and block comments (/* */) are dropped. Single- and
    double-quoted literals are matched as a whole and returned unchanged, so
    comment markers that appear inside them are left alone.

    :param code: The source text to clean.
    :return: The source text without C-style comments.
    """

    def _keep_literals(found):
        token = found.group(0)
        # Quoted literals are matched only to shield their contents.
        if token[0] in ("'", '"'):
            return token
        return ""

    comment_or_literal = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )
    return comment_or_literal.sub(_keep_literals, code)


def strip_html_style_comments(code: str) -> str:
    """
    Remove HTML-style comments (<!-- -->) from a piece of markup.

    A comment may span several lines; each one is matched non-greedily, so
    text between two separate comments is preserved.

    :param code: The markup text to clean.
    :return: The markup text without HTML-style comments.
    """
    return re.sub(r"<!--.*?-->", "", code, flags=re.DOTALL)


def strip_python_style_comments(code: str) -> str:
    """
    Strips Python-style comments from the given code string.

    Removes single-line comments (#) and triple-quoted blocks that stand alone
    on their line (docstrings and block "comments"). Ordinary string literals
    are kept, and so are triple-quoted strings that are used as values (for
    example on the right-hand side of an assignment), because deleting those
    would leave syntactically broken code behind.

    :param code: The code string to strip comments from.
    :return: The code string with Python-style comments removed.
    """
    pattern = re.compile(
        r'(?s)#.*?$|\'\'\'.*?\'\'\'|""".*?"""|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.MULTILINE,
    )

    def _replace(match):
        text = match.group(0)
        if text.startswith("#"):
            return ""
        if text.startswith(("'''", '"""')):
            # Only a triple-quoted block preceded by nothing but whitespace on
            # its line is a docstring/comment; anything else is a value.
            line_start = code.rfind("\n", 0, match.start()) + 1
            if not code[line_start:match.start()].strip():
                return ""
        # String literals are matched only so their contents are not scanned.
        return text

    return pattern.sub(_replace, code)


def _cut_shell_comment(line: str) -> str:
    """Return *line* truncated at the first ``#`` that really starts a comment."""
    quote = ""
    index = 0
    while index < len(line):
        char = line[index]
        if char == "\\" and quote != "'":
            # A backslash escapes the next character (except inside '...').
            index += 2
            continue
        if quote:
            if char == quote:
                quote = ""
        elif char in "'\"":
            quote = char
        elif char == "#" and (index == 0 or line[index - 1].isspace() or line[index - 1] in ";&|()"):
            # '#' opens a comment only at the start of a word, which keeps
            # constructs such as $#, ${#var} and 16#ff intact.
            return line[:index]
        index += 1
    return line


def strip_shell_style_comments(code: str) -> str:
    """
    Strips shell-style comments from the given code string.

    Supports single-line comments (#) and multi-line comments (: ' ').
    Shebang lines are preserved. A ``#`` inside a quoted string, or one that
    does not begin a word (``$#``, ``${#var}``), is not treated as a comment.
    Lines that contain nothing but a comment are dropped, and the final result
    has leading and trailing whitespace stripped.

    :param code: The code string to strip comments from.
    :return: The code string with shell-style comments removed.
    """
    new_lines = []
    in_multiline_comment = False

    for line in code.split("\n"):
        stripped = line.strip()
        if stripped.startswith("#!"):
            # Preserve shebang lines
            new_lines.append(line)
        elif in_multiline_comment:
            if stripped.endswith("'"):
                in_multiline_comment = False
        elif stripped.startswith(": '"):
            # The comment may already be closed on this very line
            # (": 'text'"); only stay in comment mode when it is not.
            in_multiline_comment = not (len(stripped) > 3 and stripped.endswith("'"))
        elif "#" in line:
            # Remove single-line comments
            line = _cut_shell_comment(line)
            if line.strip():
                new_lines.append(line)
        else:
            new_lines.append(line)

    return "\n".join(new_lines).strip()

def strip_sql_style_comments(code: str) -> str:
    """
    Remove SQL-style comments from a piece of SQL text.

    Line comments (--) and block comments (/* */) are dropped. Quoted
    literals are copied through unchanged, so comment markers inside them
    are not touched.

    :param code: The SQL text to clean.
    :return: The SQL text without SQL-style comments.
    """
    scanner = re.compile(
        r'--.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE,
    )
    pieces = []
    cursor = 0
    for found in scanner.finditer(code):
        # Text between two matches is ordinary code and is always kept.
        pieces.append(code[cursor:found.start()])
        token = found.group(0)
        if token.startswith(("'", '"')):
            pieces.append(token)
        cursor = found.end()
    pieces.append(code[cursor:])
    return "".join(pieces)


def strip_matlab_style_comments(code: str) -> str:
    """
    Remove MATLAB-style comments from a piece of source code.

    Everything from a % to the end of its line is dropped. Single- and
    double-quoted literals are matched as a whole and kept, so a % inside
    them survives.

    :param code: The source text to clean.
    :return: The source text without MATLAB-style comments.
    """

    def _drop_comment(found):
        token = found.group(0)
        is_literal = token.startswith(("'", '"'))
        return token if is_literal else ""

    matcher = re.compile(
        r'%.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE
    )
    return matcher.sub(_drop_comment, code)


def strip_r_style_comments(code: str) -> str:
    """
    Remove R-style comments from a piece of source code.

    Everything from a # to the end of its line is dropped. Single- and
    double-quoted literals are matched as a whole and kept, so a # inside
    them survives.

    :param code: The source text to clean.
    :return: The source text without R-style comments.
    """
    comment_or_string = re.compile(
        r'#.*?$|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', re.DOTALL | re.MULTILINE
    )
    # Every match starts with '#', "'" or '"'; only the '#' ones are comments.
    return comment_or_string.sub(
        lambda found: "" if found.group(0).startswith("#") else found.group(0),
        code,
    )


def strip_comments(code: str, language: str) -> str:
    """
    Remove comments from code using the stripper that matches its language.

    The language name selects one of the comment-style families handled in
    this module. A language that belongs to none of them leaves the code
    untouched.

    :param code: The code string to strip comments from.
    :param language: The programming language of the code.
    :return: The code string with comments removed, or the unchanged code
        when the language is not recognised.
    """
    c_family = (
        "c",
        "cpp",
        "java",
        "javascript",
        "csharp",
        "php",
        "go",
        "rust",
        "kotlin",
        "swift",
        "scala",
        "dart",
    )
    if language in c_family:
        return strip_c_style_comments(code)
    if language in ("python", "ruby", "perl"):
        return strip_python_style_comments(code)
    if language in ("bash", "powershell", "shell"):
        return strip_shell_style_comments(code)
    if language in ("html", "xml"):
        return strip_html_style_comments(code)
    if language in ("sql", "plsql", "tsql"):
        return strip_sql_style_comments(code)
    if language in ("matlab", "octave"):
        return strip_matlab_style_comments(code)
    if language == "r":
        return strip_r_style_comments(code)
    # Unknown language: nothing is stripped.
    return code
61 changes: 61 additions & 0 deletions code2prompt/language_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
""" This module contains the function to infer the programming language based on the file extension. """

import os


def infer_language(filename: str) -> str:
    """
    Guess the programming language of a file from its extension.

    The extension is compared case-insensitively against a fixed table.

    :param filename: The name of the file.
    :return: The language name, or "unknown" when the extension is not in
        the table.
    """
    languages_by_extension = {
        ".c": "c",
        ".h": "c",
        ".cpp": "cpp",
        ".hpp": "cpp",
        ".cc": "cpp",
        ".cxx": "cpp",
        ".java": "java",
        ".js": "javascript",
        ".jsx": "javascript",
        ".cs": "csharp",
        ".php": "php",
        ".go": "go",
        ".rs": "rust",
        ".kt": "kotlin",
        ".swift": "swift",
        ".scala": "scala",
        ".dart": "dart",
        ".py": "python",
        ".rb": "ruby",
        ".pl": "perl",
        ".pm": "perl",
        ".sh": "bash",
        ".ps1": "powershell",
        ".html": "html",
        ".htm": "html",
        ".xml": "xml",
        ".sql": "sql",
        ".m": "matlab",
        ".r": "r",
    }
    suffix = os.path.splitext(filename)[1].lower()
    return languages_by_extension.get(suffix, "unknown")
18 changes: 17 additions & 1 deletion code2prompt/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
""" Main module for the code2prompt package. """

from datetime import datetime
from pathlib import Path
from fnmatch import fnmatch
import click

from code2prompt.language_inference import infer_language
from code2prompt.comment_stripper import strip_comments


def parse_gitignore(gitignore_path):
"""Parse the .gitignore file and return a set of patterns."""
Expand Down Expand Up @@ -98,7 +102,14 @@ def is_binary(file_path):
@click.option(
"--filter", "-f", type=str, help='Filter pattern to include files (e.g., "*.py").'
)
def create_markdown_file(path, output, gitignore, filter):
@click.option(
"--suppress-comments",
"-s",
is_flag=True,
help="Strip comments from the code files.",
default=False,
)
def create_markdown_file(path, output, gitignore, filter, suppress_comments):
"""Create a Markdown file with the content of files in a directory."""
content = []
table_of_contents = []
Expand Down Expand Up @@ -127,6 +138,10 @@ def create_markdown_file(path, output, gitignore, filter):
try:
with file_path.open("r", encoding="utf-8") as f:
file_content = f.read()
if suppress_comments:
language = infer_language(file_path.name)
if language != "unknown":
file_content = strip_comments(file_content, language)
except UnicodeDecodeError:
# Ignore files that cannot be decoded
continue
Expand Down Expand Up @@ -158,4 +173,5 @@ def create_markdown_file(path, output, gitignore, filter):


if __name__ == "__main__":
# pylint: disable=no-value-for-parameter
create_markdown_file()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "code2prompt"
version = "0.1.3"
version = "0.2.0"
description = ""
authors = ["Raphael MANSUY <[email protected]>"]
readme = "README.md"
Expand Down
Loading

0 comments on commit 6368077

Please sign in to comment.