特别是理论上,WindowsXP 以后文件名应该是以 UFT-16LE 存在硬盘上的,可以覆盖所有Unicode 分区。可是 Python 2.7 里,通过命令行得到的文件名,只是当前 Codepage 编码的字符串。我们的目的是得到 Unicode 编码覆盖的所有码区的文件名。
输出也是这个问题,CP936 下 print()只能输出GBK码,不能输出所有 Unicode 范围的文本。
WinXP 控制台如此,Win7 也未必能好,倒是 Python3 应该是问题少些吧。
百度了一圈儿 Python2.7 怎么接受 console 下的繁简体,中日韩,英法俄语言共用,结果是无解。只好再搜英文网页,结果发现还真有料。记录在此,以供传播。
基本思想就是通过 ctypes 调用 windll 相关的宽字符 API,这样就可以直接得到或者输出 unicode 的结果了。
从命令行接受 unicode 编码的选项相对容易一些,使用 shell32.GetCommandLineArgvW 就差不多了,输出反而要复杂些,因为要替代 output stream,结果得弄一个全套的 io class 。
命令行读入 Unicode 选项: http://stackoverflow.com/a/846931
def win32_unicode_argv(): """Uses shell32.GetCommandLineArgvW to get sys.argv as a list of Unicode strings. Versions 2.x of Python don't support Unicode in sys.argv on Windows, with the underlying Windows API instead replacing multi-byte characters with '?'. """ from ctypes import POINTER, byref, cdll, c_int, windll from ctypes.wintypes import LPCWSTR, LPWSTR GetCommandLineW = cdll.kernel32.GetCommandLineW GetCommandLineW.argtypes = [] GetCommandLineW.restype = LPCWSTR CommandLineToArgvW = windll.shell32.CommandLineToArgvW CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)] CommandLineToArgvW.restype = POINTER(LPWSTR) cmd = GetCommandLineW() argc = c_int(0) argv = CommandLineToArgvW(cmd, byref(argc)) if argc.value > 0 and not hasattr(sys, "frozen"): # Remove Python executable and commands if present and not frozen start = argc.value - len(sys.argv) return [argv[i] for i in xrange(start, argc.value)]命令行Unicode输出: http://stackoverflow.com/a/3259271
def win32_unicode_output(): import codecs from ctypes import WINFUNCTYPE, windll, POINTER, byref from ctypes.wintypes import BOOL, HANDLE, DWORD, LPWSTR, LPVOID original_stderr = sys.stderr # If any exception occurs in this code, we'll probably try to print it on # stderr, which makes for frustrating debugging if stderr is directed to # our wrapper. So be paranoid about catching errors and reporting them to # original_stderr, so that we can at least see them. def _complain(message): print(message if isinstance(message, str) else repr(message), file=original_stderr) # Work around <http://bugs.python.org/issue6058>. codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) # Make Unicode console output work independently of the current code page. # This also fixes <http://bugs.python.org/issue1602>. Credit to Michael # Kaplan <http://blogs.msdn.com/b/michkap/archive/2010/04/07/9989346.aspx> # and TZOmegaTZIOY # <http://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash/1432462#1432462>. try: # <http://msdn.microsoft.com/en-us/library/ms683231(VS.85).aspx> # HANDLE WINAPI GetStdHandle(DWORD nStdHandle); # returns INVALID_HANDLE_VALUE, NULL, or a valid handle # # <http://msdn.microsoft.com/en-us/library/aa364960(VS.85).aspx> # DWORD WINAPI GetFileType(DWORD hFile); # # <http://msdn.microsoft.com/en-us/library/ms683167(VS.85).aspx> # BOOL WINAPI GetConsoleMode(HANDLE hConsole, LPDWORD lpMode); GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(("GetStdHandle", windll.kernel32)) STD_OUTPUT_HANDLE = DWORD(-11) STD_ERROR_HANDLE = DWORD(-12) GetFileType = WINFUNCTYPE(DWORD, DWORD)(("GetFileType", windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(("GetConsoleMode", windll.kernel32)) INVALID_HANDLE_VALUE = DWORD(-1).value def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or GetConsoleMode(handle, byref(DWORD())) == 0) old_stdout_fileno = None old_stderr_fileno = None if hasattr(sys.stdout, 'fileno'): old_stdout_fileno = sys.stdout.fileno() if hasattr(sys.stderr, 'fileno'): old_stderr_fileno = sys.stderr.fileno() STDOUT_FILENO = 1 STDERR_FILENO = 2 real_stdout = (old_stdout_fileno == STDOUT_FILENO) real_stderr = (old_stderr_fileno == STDERR_FILENO) if real_stdout: hStdout = GetStdHandle(STD_OUTPUT_HANDLE) if not_a_console(hStdout): real_stdout = False if real_stderr: hStderr = GetStdHandle(STD_ERROR_HANDLE) if not_a_console(hStderr): real_stderr = False if real_stdout or real_stderr: # BOOL WINAPI WriteConsoleW(HANDLE hOutput, LPWSTR lpBuffer, DWORD nChars, # LPDWORD lpCharsWritten, LPVOID lpReserved); WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(("WriteConsoleW", windll.kernel32)) class UnicodeOutput: def __init__(self, hConsole, stream, fileno, name): self._hConsole = hConsole self._stream = stream self._fileno = fileno self.closed = False self.softspace = False self.mode = 'w' self.encoding = 'utf-8' self.name = name self.flush() def isatty(self): return False def close(self): # don't really close the handle, that would only cause problems self.closed = True def fileno(self): return self._fileno def flush(self): if self._hConsole is None: try: self._stream.flush() except Exception as e: _complain("%s.flush: %r from %r" % (self.name, e, self._stream)) raise def write(self, text): try: if self._hConsole is None: if isinstance(text, unicode): text = text.encode('utf-8') self._stream.write(text) else: if not isinstance(text, unicode): text = str(text).decode('utf-8') remaining = len(text) while remaining: n = DWORD(0) # There is a shorter-than-documented limitation on the # length of the string passed to WriteConsoleW (see # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>. retval = WriteConsoleW(self._hConsole, text, min(remaining, 10000), byref(n), None) if retval == 0 or n.value == 0: raise IOError("WriteConsoleW returned %r, n.value = %r" % (retval, n.value)) remaining -= n.value if not remaining: break text = text[n.value:] except Exception as e: _complain("%s.write: %r" % (self.name, e)) raise def writelines(self, lines): try: for line in lines: self.write(line) except Exception as e: _complain("%s.writelines: %r" % (self.name, e)) raise if real_stdout: sys.stdout = UnicodeOutput(hStdout, None, STDOUT_FILENO, '<Unicode console stdout>') else: sys.stdout = UnicodeOutput(None, sys.stdout, old_stdout_fileno, '<Unicode redirected stdout>') if real_stderr: sys.stderr = UnicodeOutput(hStderr, None, STDERR_FILENO, '<Unicode console stderr>') else: sys.stderr = UnicodeOutput(None, sys.stderr, old_stderr_fileno, '<Unicode redirected stderr>') except Exception as e: _complain("exception %r while fixing up sys.stdout and sys.stderr" % (e,))当然最好还是去源网页参考原来的内容,以免这里有引用错误。