Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fixes for datapythonista's review
  • Loading branch information
MomIsBestFriend committed Jan 1, 2020
commit c19a2a5bc55697a2e9d79b51cb8492b6cbc7a958
6 changes: 5 additions & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,11 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for use of not concatenated strings' ; echo $MSG
$BASE_DIR/scripts/validate_string_concatenation.py pandas
if [[ "$GITHUB_ACTIONS" == "true" ]]; then
$BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:String unnecessarily split in two by black. Please merge them manually." .
else
$BASE_DIR/scripts/validate_string_concatenation.py .
fi
RET=$(($RET + $?)) ; echo $MSG "DONE"

echo "isort --version-number"
Expand Down
116 changes: 76 additions & 40 deletions scripts/validate_string_concatenation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/env python
"""
GH #30454

Expand All @@ -12,7 +12,6 @@
... "baz"
... )


into this:

>>> foo = ("bar " "baz")
Expand All @@ -22,65 +21,102 @@
so we are checking it here.
"""

import argparse
import os
import sys
import token
import tokenize
from typing import FrozenSet, Generator, List
from typing import Dict, Generator, List

FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd")

FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset(
(".pxd", ".py", ".pyx", ".pyx.ini")
)

def main(source_path: str, output_format: str) -> bool:
"""
Main entry point of the script.

Parameters
----------
source_path : str, default '.'
Source path representing path to a file/directory.
output_format : str
Output format of the script.

Returns
-------
bool
True if found any strings that needs to be concatenated.

def strings_to_concatenate(file_path: str) -> Generator[str, None, None]:
Raises
------
ValueError
If the `source_path` is not pointing to existing file/directory.
"""
if not os.path.exists(source_path):
raise ValueError(
"Please enter a valid path, pointing to a valid file/directory."
)

is_failed: bool = False

if os.path.isfile(source_path):
for values in strings_to_concatenate(source_path):
is_failed = True
print(output_format.format(**values))

for subdir, _, files in os.walk(source_path):
for file_name in files:
if any(
file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK
):
for values in strings_to_concatenate(os.path.join(subdir, file_name)):
is_failed = True
print(output_format.format(**values))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The output of this will be something like:

pandas/__init__.py:124: BETWEEN first part of the string AND with a second part in another string 

I don't think people will understand what's going on when seeing this error in the CI. I think we should display something like:

pandas/__init__.py:124:String unnecessarily split in two by black. Please merge them manually. 
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure about this, will the message be too long?
and where do I put the source_path, line_number, start, end?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The message being long is not a problem. Neither for the CI or for the code.

From my example, the format would be:

{source_path}:{line_number}:String unnecessarily split in two by black. Please merge them manually. 

As I mentioned eariler, I don't find start and end particularly useful for the reader of the error. Feel free to leave them if you disagree, but better rename them, their name is misleading. string1 and string2 are not great names, but they're better. At least they don't create the wrong impression those are the positions of the strings.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can now see what you mean, Fixed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, looks much better.

Sorry I said it wrong, but the format should actually be {source_path}:{line_number}:{msg}. We'll decide the error here. Imagine that this function can detect more than one error, we can't say in the format which is the error we want to receive.

So this could become:

Suggested change
print(output_format.format(**values))
msg = "String unnecessarily split in two by black. Please merge them manually."
print(output_format.format(source_path=source_path, line_number=line_number, msg=msg))

And since now the strings_to_concatenate function is just returning two values, I think it makes more sense to return a tuple with them, instead of a dict, so the for above would be:

for source_path, line_number in strings_to_concatenate(os.path.join(subdir, file_name)): 
return is_failed


def strings_to_concatenate(source_path: str) -> Generator[Dict[str, str], None, None]:
"""
Yielding the strings that needs to be concatenated in a given file.

Parameters
----------
file_path : str
source_path : str
File path pointing to a single file.

Yields
------
str
Message containing info about the string that needs to be concatenated.
dict of {str: str}
Containing:
source_path
Source file path.
line_number
Line number of unconcatenated string.
"""
with open(file_path, "r") as file_name:
with open(source_path, "r") as file_name:
tokens: List = list(tokenize.generate_tokens(file_name.readline))

for current_token, next_token in zip(tokens, tokens[1:]):
if current_token[0] == next_token[0] == token.STRING:
line_number = current_token[2][0]
start = current_token[1]
end = next_token[1]
yield f"{file_path}:{line_number}:\t between {start} and {end}\n"
yield {"source_path": source_path, "line_number": current_token[2][0]}


if __name__ == "__main__":
path: str = sys.argv[1]

if not os.path.exists(path):
raise ValueError("Please enter a valid path, to a file/directory.")

failed: bool = False

if os.path.isfile(path):
for msg in strings_to_concatenate(path):
if msg:
failed = True
print(msg)

for subdir, _, files in os.walk(path):
for file_name in files:
if any(
file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK
):
file_extension = os.path.join(subdir, file_name)

for msg in strings_to_concatenate(os.path.join(subdir, file_name)):
if msg:
failed = True
print(msg)
sys.exit(failed)
parser = argparse.ArgumentParser(description="Validate concatenated strings")

parser.add_argument(
"path", nargs="?", default=".", help="Source path of file/directory to check."
)
parser.add_argument(
"--format",
"-f",
default=(
"{source_path}:{line_number}:String unnecessarily split in two by black. "
"Please merge them manually."
),
help="Output format of the unconcatenated strings.",
)

args = parser.parse_args()

sys.exit(main(source_path=args.path, output_format=args.format))