diff options
Diffstat (limited to 'Documentation/filesystems')
22 files changed, 396 insertions, 70 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f15621ee5599..5139b8c9d5af 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -1,7 +1,5 @@ 00-INDEX - this file (info on some of the filesystems supported by linux). -Exporting - - explanation of how to make filesystems exportable. Locking - info on locking rules as they pertain to Linux VFS. 9p.txt @@ -36,6 +34,8 @@ dnotify.txt - info about directory notification in Linux. ecryptfs.txt - docs on eCryptfs: stacked cryptographic filesystem for Linux. +exofs.txt + - info, usage, mount options, design about EXOFS. ext2.txt - info, mount options and specifications for the Ext2 filesystem. ext3.txt @@ -62,16 +62,14 @@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. +logfs.txt + - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. -nfs41-server.txt - - info on the Linux server implementation of NFSv4 minor version 1. -nfs-rdma.txt - - how to install and setup the Linux NFS/RDMA client and server software. -nfsroot.txt - - short guide on setting up a diskless box with NFS root filesystem. +nfs/ + - nfs-related documentation. nilfs2.txt - info and mount options for the NILFS2 filesystem. ntfs.txt @@ -90,8 +88,6 @@ relay.txt - info on relay, for efficient streaming from kernel to user space. romfs.txt - description of the ROMFS filesystem. -rpc-cache.txt - - introduction to the caching mechanisms in the sunrpc layer. seq_file.txt - how to use the seq_file API sharedsubtree.txt diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 18b9d0ca0630..06bbbed71206 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -460,13 +460,6 @@ in sys_read() and friends. --------------------------- dquot_operations ------------------------------- prototypes: - int (*initialize) (struct inode *, int); - int (*drop) (struct inode *); - int (*alloc_space) (struct inode *, qsize_t, int); - int (*alloc_inode) (const struct inode *, unsigned long); - int (*free_space) (struct inode *, qsize_t); - int (*free_inode) (const struct inode *, unsigned long); - int (*transfer) (struct inode *, struct iattr *); int (*write_dquot) (struct dquot *); int (*acquire_dquot) (struct dquot *); int (*release_dquot) (struct dquot *); @@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations. What filesystem should expect from the generic quota functions: FS recursion Held locks when called -initialize: yes maybe dqonoff_sem -drop: yes - -alloc_space: ->mark_dirty() - -alloc_inode: ->mark_dirty() - -free_space: ->mark_dirty() - -free_inode: ->mark_dirty() - -transfer: yes - write_dquot: yes dqonoff_sem or dqptr_sem acquire_dquot: yes dqonoff_sem or dqptr_sem release_dquot: yes dqonoff_sem or dqptr_sem @@ -495,10 +481,6 @@ write_info: yes dqonoff_sem FS recursion means calling ->quota_read() and ->quota_write() from superblock operations. -->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called -only directly by the filesystem and do not call any fs functions only -the ->mark_dirty() operation. - More details about quota locking can be found in fs/dquot.c. --------------------------- vm_operations_struct ----------------------------- diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt index 4c0c575a4012..79334ed5daa7 100644 --- a/Documentation/filesystems/dentry-locking.txt +++ b/Documentation/filesystems/dentry-locking.txt @@ -62,7 +62,8 @@ changes are : 2. Insertion of a dentry into the hash table is done using hlist_add_head_rcu() which take care of ordering the writes - the writes to the dentry must be visible before the dentry is - inserted. This works in conjunction with hlist_for_each_rcu() while + inserted. This works in conjunction with hlist_for_each_rcu(), + which has since been replaced by hlist_for_each_entry_rcu(), while walking the hash chain. The only requirement is that all initialization to the dentry must be done before hlist_add_head_rcu() since we don't have dcache_lock protection diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt index 0ced74c2f73c..abd2a9b5b787 100644 --- a/Documentation/filesystems/exofs.txt +++ b/Documentation/filesystems/exofs.txt @@ -60,13 +60,13 @@ USAGE mkfs.exofs --pid=65536 --format /dev/osd0 - The --format is optional if not specified no OSD_FORMAT will be - preformed and a clean file system will be created in the specified pid, + The --format is optional. If not specified, no OSD_FORMAT will be + performed and a clean file system will be created in the specified pid, in the available space of the target. (Use --format=size_in_meg to limit the total LUN space available) - If pid already exist it will be deleted and a new one will be created in it's - place. Be careful. + If pid already exists, it will be deleted and a new one will be created in + its place. Be careful. An exofs lives inside a single OSD partition. You can create multiple exofs filesystems on the same device using multiple pids. @@ -81,7 +81,7 @@ USAGE 7. For reference (See do-exofs example script): do-exofs start - an example of how to perform the above steps. - do-exofs stop - an example of how to unmount the file system. + do-exofs stop - an example of how to unmount the file system. do-exofs format - an example of how to format and mkfs a new exofs. 8. Extra compilation flags (uncomment in fs/exofs/Kbuild): @@ -104,8 +104,8 @@ Where: exofs specific options: Options are separated by commas (,) pid=<integer> - The partition number to mount/create as container of the filesystem. - This option is mandatory - to=<integer> - Timeout in ticks for a single command + This option is mandatory. + to=<integer> - Timeout in ticks for a single command. default is (60 * HZ) [for debugging only] =============================================================================== @@ -116,7 +116,7 @@ DESIGN with a special ID (defined in common.h). Information included in the file system control block is used to fill the in-memory superblock structure at mount time. This object is created before - the file system is used by mkexofs.c It contains information such as: + the file system is used by mkexofs.c. It contains information such as: - The file system's magic number - The next inode number to be allocated @@ -134,8 +134,8 @@ DESIGN attributes. This applies to both regular files and other types (directories, device files, symlinks, etc.). -* Credentials are generated per object (inode and superblock) when they is - created in memory (read off disk or created). The credential works for all +* Credentials are generated per object (inode and superblock) when they are + created in memory (read from disk or created). The credential works for all operations and is used as long as the object remains in memory. * Async OSD operations are used whenever possible, but the target may execute @@ -145,7 +145,8 @@ DESIGN from executing in reverse order: - The following are handled with the OBJ_CREATED and OBJ_2BCREATED flags. OBJ_CREATED is set when we know the object exists on the OSD - - in create's callback function, and when we successfully do a read_inode. + in create's callback function, and when we successfully do a + read_inode. OBJ_2BCREATED is set in the beginning of the create function, so we know that we should wait. - create/delete: delete should wait until the object is created diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt index 05d5cf1d743f..867c5b50cb42 100644 --- a/Documentation/filesystems/ext3.txt +++ b/Documentation/filesystems/ext3.txt @@ -32,8 +32,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that this forces - mount of inconsistent filesystem, which can lead to +norecovery Don't load the journal on mounting. Note that this forces +noload mount of inconsistent filesystem, which can lead to various problems. data=journal All data are committed into the journal prior to being diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6d94e0696f8c..e1def1786e50 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers identified through its new major/minor numbers encoded in devnum. -noload Don't load the journal on mounting. Note that - if the filesystem was not unmounted cleanly, +norecovery Don't load the journal on mounting. Note that +noload if the filesystem was not unmounted cleanly, skipping the journal replay will lead to the filesystem containing inconsistencies that can lead to any number of problems. @@ -196,7 +196,7 @@ nobarrier This also requires an IO stack which can support also be used to enable or disable barriers, for consistency with other ext4 mount options. -inode_readahead=n This tuning parameter controls the maximum +inode_readahead_blks=n This tuning parameter controls the maximum number of inode table blocks that ext4's inode table readahead algorithm will pre-read into the buffer cache. The default value is 32 blocks. @@ -353,6 +353,12 @@ noauto_da_alloc replacing existing files via patterns such as system crashes before the delayed allocation blocks are forced to disk. +discard Controls whether ext4 should issue discard/TRIM +nodiscard(*) commands to the underlying block device when + blocks are freed. This is useful for SSD devices + and sparse/thinly-provisioned LUNs, but it is off + by default until sufficient testing has been done. + Data Mode ========= There are 3 different data modes: diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt new file mode 100644 index 000000000000..e64c94ba401a --- /dev/null +++ b/Documentation/filesystems/logfs.txt @@ -0,0 +1,241 @@ + +The LogFS Flash Filesystem +========================== + +Specification +============= + +Superblocks +----------- + +Two superblocks exist at the beginning and end of the filesystem. +Each superblock is 256 Bytes large, with another 3840 Bytes reserved +for future purposes, making a total of 4096 Bytes. + +Superblock locations may differ for MTD and block devices. On MTD the +first non-bad block contains a superblock in the first 4096 Bytes and +the last non-bad block contains a superblock in the last 4096 Bytes. +On block devices, the first 4096 Bytes of the device contain the first +superblock and the last aligned 4096 Byte-block contains the second +superblock. + +For the most part, the superblocks can be considered read-only. They +are written only to correct errors detected within the superblocks, +move the journal and change the filesystem parameters through tunefs. +As a result, the superblock does not contain any fields that require +constant updates, like the amount of free space, etc. + +Segments +-------- + +The space in the device is split up into equal-sized segments. +Segments are the primary write unit of LogFS. Within each segments, +writes happen from front (low addresses) to back (high addresses. If +only a partial segment has been written, the segment number, the +current position within and optionally a write buffer are stored in +the journal. + +Segments are erased as a whole. Therefore Garbage Collection may be +required to completely free a segment before doing so. + +Journal +-------- + +The journal contains all global information about the filesystem that +is subject to frequent change. At mount time, it has to be scanned +for the most recent commit entry, which contains a list of pointers to +all currently valid entries. + +Object Store +------------ + +All space except for the superblocks and journal is part of the object +store. Each segment contains a segment header and a number of +objects, each consisting of the object header and the payload. +Objects are either inodes, directory entries (dentries), file data +blocks or indirect blocks. + +Levels +------ + +Garbage collection (GC) may fail if all data is written +indiscriminately. One requirement of GC is that data is seperated +roughly according to the distance between the tree root and the data. +Effectively that means all file data is on level 0, indirect blocks +are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, +respectively. Inode file data is on level 6 for the inodes and 7-11 +for indirect blocks. + +Each segment contains objects of a single level only. As a result, +each level requires its own seperate segment to be open for writing. + +Inode File +---------- + +All inodes are stored in a special file, the inode file. Single +exception is the inode file's inode (master inode) which for obvious +reasons is stored in the journal instead. Instead of data blocks, the +leaf nodes of the inode files are inodes. + +Aliases +------- + +Writes in LogFS are done by means of a wandering tree. A naïve +implementation would require that for each write or a block, all +parent blocks are written as well, since the block pointers have +changed. Such an implementation would not be very efficient. + +In LogFS, the block pointer changes are cached in the journal by means +of alias entries. Each alias consists of its logical address - inode +number, block index, level and child number (index into block) - and +the changed data. Any 8-byte word can be changes in this manner. + +Currently aliases are used for block pointers, file size, file used +bytes and the height of an inodes indirect tree. + +Segment Aliases +--------------- + +Related to regular aliases, these are used to handle bad blocks. +Initially, bad blocks are handled by moving the affected segment +content to a spare segment and noting this move in the journal with a +segment alias, a simple (to, from) tupel. GC will later empty this +segment and the alias can be removed again. This is used on MTD only. + +Vim +--- + +By cleverly predicting the life time of data, it is possible to +seperate long-living data from short-living data and thereby reduce +the GC overhead later. Each type of distinc life expectency (vim) can +have a seperate segment open for writing. Each (level, vim) tupel can +be open just once. If an open segment with unknown vim is encountered +at mount time, it is closed and ignored henceforth. + +Indirect Tree +------------- + +Inodes in LogFS are similar to FFS-style filesystems with direct and +indirect block pointers. One difference is that LogFS uses a single +indirect pointer that can be either a 1x, 2x, etc. indirect pointer. +A height field in the inode defines the height of the indirect tree +and thereby the indirection of the pointer. + +Another difference is the addressing of indirect blocks. In LogFS, +the first 16 pointers in the first indirect block are left empty, +corresponding to the 16 direct pointers in the inode. In ext2 (maybe +others as well) the first pointer in the first indirect block +corresponds to logical block 12, skipping the 12 direct pointers. +So where ext2 is using arithmetic to better utilize space, LogFS keeps +arithmetic simple and uses compression to save space. + +Compression +----------- + +Both file data and metadata can be compressed. Compression for file +data can be enabled with chattr +c and disabled with chattr -c. Doing +so has no effect on existing data, but new data will be stored +accordingly. New inodes will inherit the compression flag of the +parent directory. + +Metadata is always compressed. However, the space accounting ignores +this and charges for the uncompressed size. Failing to do so could +result in GC failures when, after moving some data, indirect blocks +compress worse than previously. Even on a 100% full medium, GC may +not consume any extra space, so the compression gains are lost space +to the user. + +However, they are not lost space to the filesystem internals. By +cheating the user for those bytes, the filesystem gained some slack +space and GC will run less often and faster. + +Garbage Collection and Wear Leveling +------------------------------------ + +Garbage collection is invoked whenever the number of free segments +falls below a threshold. The best (known) candidate is picked based +on the least amount of valid data contained in the segment. All +remaining valid data is copied elsewhere, thereby invalidating it. + +The GC code also checks for aliases and writes then back if their +number gets too large. + +Wear leveling is done by occasionally picking a suboptimal segment for +garbage collection. If a stale segments erase count is significantly +lower than the active segments' erase counts, it will be picked. Wear +leveling is rate limited, so it will never monopolize the device for +more than one segment worth at a time. + +Values for "occasionally", "significantly lower" are compile time +constants. + +Hashed directories +------------------ + +To satisfy efficient lookup(), directory entries are hashed and +located based on the hash. In order to both support large directories +and not be overly inefficient for small directories, several hash +tables of increasing size are used. For each table, the hash value +modulo the table size gives the table index. + +Tables sizes are chosen to limit the number of indirect blocks with a +fully populated table to 0, 1, 2 or 3 respectively. So the first +table contains 16 entries, the second 512-16, etc. + +The last table is special in several ways. First its size depends on +the effective 32bit limit on telldir/seekdir cookies. Since logfs +uses the upper half of the address space for indirect blocks, the size +is limited to 2^31. Secondly the table contains hash buckets with 16 +entries each. + +Using single-entry buckets would result in birthday "attacks". At +just 2^16 used entries, hash collisions would be likely (P >= 0.5). +My math skills are insufficient to do the combinatorics for the 17x +collisions necessary to overflow a bucket, but testing showed that in +10,000 runs the lowest directory fill before a bucket overflow was +188,057,130 entries with an average of 315,149,915 entries. So for +directory sizes of up to a million, bucket overflows should be +virtually impossible under normal circumstances. + +With carefully chosen filenames, it is obviously possible to cause an +overflow with just 21 entries (4 higher tables + 16 entries + 1). So +there may be a security concern if a malicious user has write access +to a directory. + +Open For Discussion +=================== + +Device Address Space +-------------------- + +A device address space is used for caching. Both block devices and +MTD provide functions to either read a single page or write a segment. +Partial segments may be written for data integrity, but where possible +complete segments are written for performance on simple block device +flash media. + +Meta Inodes +----------- + +Inodes are stored in the inode file, which is just a regular file for +most purposes. At umount time, however, the inode file needs to +remain open until all dirty inodes are written. So +generic_shutdown_super() may not close this inode, but shouldn't +complain about remaining inodes due to the inode file either. Same +goes for mapping inode of the device address space. + +Currently logfs uses a hack that essentially copies part of fs/inode.c +code over. A general solution would be preferred. + +Indirect block mapping +---------------------- + +With compression, the block device (or mapping inode) cannot be used +to cache indirect blocks. Some other place is required. Currently +logfs uses the top half of each inode's address space. The low 8TB +(on 32bit) are filled with file data, the high 8TB are used for +indirect blocks. + +One problem is that 16TB files created on 64bit systems actually have +data in the top 8TB. But files >16TB would cause problems anyway, so +only the limit has changed. diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX new file mode 100644 index 000000000000..2f68cd688769 --- /dev/null +++ b/Documentation/filesystems/nfs/00-INDEX @@ -0,0 +1,16 @@ +00-INDEX + - this file (nfs-related documentation). +Exporting + - explanation of how to make filesystems exportable. +knfsd-stats.txt + - statistics which the NFS server makes available to user space. +nfs.txt + - nfs client, and DNS resolution for fs_locations. +nfs41-server.txt + - info on the Linux server implementation of NFSv4 minor version 1. +nfs-rdma.txt + - how to install and setup the Linux NFS/RDMA client and server software +nfsroot.txt + - short guide on setting up a diskless box with NFS root filesystem. +rpc-cache.txt + - introduction to the caching mechanisms in the sunrpc layer. diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/nfs/Exporting index 87019d2b5981..87019d2b5981 100644 --- a/Documentation/filesystems/Exporting +++ b/Documentation/filesystems/nfs/Exporting diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/nfs/knfsd-stats.txt index 64ced5149d37..64ced5149d37 100644 --- a/Documentation/filesystems/knfsd-stats.txt +++ b/Documentation/filesystems/nfs/knfsd-stats.txt diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt index e386f7e4bcee..e386f7e4bcee 100644 --- a/Documentation/filesystems/nfs-rdma.txt +++ b/Documentation/filesystems/nfs/nfs-rdma.txt diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs/nfs.txt index f50f26ce6cd0..f50f26ce6cd0 100644 --- a/Documentation/filesystems/nfs.txt +++ b/Documentation/filesystems/nfs/nfs.txt diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt index 5920fe26e6ff..6a53a84afc72 100644 --- a/Documentation/filesystems/nfs41-server.txt +++ b/Documentation/filesystems/nfs/nfs41-server.txt @@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4 on or off; rpc.nfsd does this correctly.) The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based -on the latest NFSv4.1 Internet Draft: -http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29 +on RFC 5661. From the many new features in NFSv4.1 the current implementation focuses on the mandatory-to-implement NFSv4.1 Sessions, providing @@ -41,10 +40,10 @@ interoperability problems with future clients. Known issues: conformant with the spec (for example, we don't use kerberos on the backchannel correctly). - no trunking support: no clients currently take advantage of - trunking, but this is a mandatory failure, and its use is + trunking, but this is a mandatory feature, and its use is recommended to clients in a number of places. (E.g. to ensure timely renewal in case an existing connection's retry timeouts - have gotten too long; see section 8.3 of the draft.) + have gotten too long; see section 8.3 of the RFC.) Therefore, lack of this feature may cause future clients to fail. - Incomplete backchannel support: incomplete backchannel gss @@ -213,3 +212,10 @@ The following cases aren't supported yet: DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID. * DESTROY_SESSION MUST be the final operation in the COMPOUND request. +Nonstandard compound limitations: +* No support for a sessions fore channel RPC compound that requires both a + ca_maxrequestsize request and a ca_maxresponsesize reply, so we may + fail to live up to the promise we made in CREATE_SESSION fore channel + negotiation. +* No more than one IO operation (read, write, readdir) allowed per + compound. diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index 3ba0b945aaf8..3ba0b945aaf8 100644 --- a/Documentation/filesystems/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt diff --git a/Documentation/filesystems/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt index 8a382bea6808..8a382bea6808 100644 --- a/Documentation/filesystems/rpc-cache.txt +++ b/Documentation/filesystems/nfs/rpc-cache.txt diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt index 01539f410676..cf6d0d85ca82 100644 --- a/Documentation/filesystems/nilfs2.txt +++ b/Documentation/filesystems/nilfs2.txt @@ -28,7 +28,7 @@ described in the man pages included in the package. Project web page: http://www.nilfs.org/en/ Download page: http://www.nilfs.org/en/download.html Git tree web page: http://www.nilfs.org/git/ -NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users +List info: http://vger.kernel.org/vger-lists.html#linux-nilfs Caveats ======= @@ -49,8 +49,7 @@ Mount options NILFS2 supports the following mount options: (*) == default -barrier=on(*) This enables/disables barriers. barrier=off disables - it, barrier=on enables it. +nobarrier Disables barriers. errors=continue(*) Keep going on a filesystem error. errors=remount-ro Remount the filesystem read-only on an error. errors=panic Panic and halt the machine if an error occurs. @@ -71,6 +70,13 @@ order=strict Apply strict in-order semantics that preserves sequence blocks. That means, it is guaranteed that no overtaking of events occurs in the recovered file system after a crash. +norecovery Disable recovery of the filesystem on mount. + This disables every write access on the device for + read-only mounts or snapshots. This option will fail + for r/w mounts on an unclean volume. +discard Issue discard/TRIM commands to the underlying block + device when blocks are freed. This is useful for SSD + devices and sparse/thinly-provisioned LUNs. NILFS2 usage ============ diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 92b888d540a6..a7e9746ee7ea 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now. New super_block field "struct export_operations *s_export_op" for explicit support for exporting, e.g. via NFS. The structure is fully documented at its declaration in include/linux/fs.h, and in -Documentation/filesystems/Exporting. +Documentation/filesystems/nfs/Exporting. Briefly it allows for the definition of decode_fh and encode_fh operations to encode and decode filehandles, and allows the filesystem to use diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 2c48f945546b..96a44dd95e03 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -38,6 +38,7 @@ Table of Contents 3.3 /proc/<pid>/io - Display the IO accounting fields 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings 3.5 /proc/<pid>/mountinfo - Information about mounts + 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm ------------------------------------------------------------------------------ @@ -163,6 +164,7 @@ read the file /proc/PID/status: VmExe: 68 kB VmLib: 1412 kB VmPTE: 20 kb + VmSwap: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -176,7 +178,6 @@ read the file /proc/PID/status: CapBnd: ffffffffffffffff voluntary_ctxt_switches: 0 nonvoluntary_ctxt_switches: 1 - Stack usage: 12 kB This shows you nearly the same information you would get if you viewed it with the ps command. In fact, ps uses the proc file system to obtain its @@ -188,6 +189,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file contains details information about the process itself. Its fields are explained in Table 1-4. +(for SMP CONFIG users) +For making accounting scalable, RSS related information are handled in +asynchronous manner and the vaule may not be very precise. To see a precise +snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. +It's slow but very precise. + Table 1-2: Contents of the statm files (as of 2.6.30-rc7) .............................................................................. Field Content @@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries + VmSwap size of swap usage (the number of referred swapents) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread @@ -230,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) Mems_allowed_list Same as previous, but in "list format" voluntary_ctxt_switches number of voluntary context switches nonvoluntary_ctxt_switches number of non voluntary context switches - Stack usage: stack usage high water mark (round up to page size) .............................................................................. Table 1-3: Contents of the statm files (as of 2.6.8-rc3) @@ -431,6 +438,7 @@ Table 1-5: Kernel info in /proc modules List of loaded modules mounts Mounted filesystems net Networking info (see text) + pagetypeinfo Additional page allocator information (see text) (2.5) partitions Table of partitions known to the system pci Deprecated info of PCI bus (new way -> /proc/bus/pci/, decoupled by lspci (2.4) @@ -585,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ... Node 0, zone Normal 1 0 0 1 101 8 ... Node 0, zone HighMem 2 0 0 1 1 0 ... -Memory fragmentation is a problem under some workloads, and buddyinfo is a +External fragmentation is a problem under some workloads, and buddyinfo is a useful tool for helping diagnose these problems. Buddyinfo will give you a clue as to how big an area you can safely allocate, or why a previous allocation failed. @@ -595,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE available in ZONE_NORMAL, etc... +More information relevant to external fragmentation can be found in +pagetypeinfo. + +> cat /proc/pagetypeinfo +Page block order: 9 +Pages per block: 512 + +Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 +Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0 +Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2 +Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0 +Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0 +Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9 +Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0 +Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452 +Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0 +Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0 + +Number of blocks type Unmovable Reclaimable Movable Reserve Isolate +Node 0, zone DMA 2 0 5 1 0 +Node 0, zone DMA32 41 6 967 2 0 + +Fragmentation avoidance in the kernel works by grouping pages of different +migrate types into the same contiguous regions of memory called page blocks. +A page block is typically the size of the default hugepage size e.g. 2MB on +X86-64. By keeping pages grouped based on their ability to move, the kernel +can reclaim pages within a page block to satisfy a high-order allocation. + +The pagetypinfo begins with information on the size of a page block. It +then gives the same type of information as buddyinfo except broken down +by migrate-type and finishes with details on how many page blocks of each +type exist. + +If min_free_kbytes has been tuned correctly (recommendations made by hugeadm +from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can +make an estimate of the likely number of huge pages that can be allocated +at a given point in time. All the "Movable" blocks should be allocatable +unless memory has been mlock()'d. Some of the Reclaimable blocks should +also be allocatable although a lot of filesystem metadata may have to be +reclaimed to achieve this. + .............................................................................. meminfo: @@ -1072,7 +1122,8 @@ second). The meanings of the columns are as follows, from left to right: - irq: servicing interrupts - softirq: servicing softirqs - steal: involuntary wait -- guest: running a guest +- guest: running a normal guest +- guest_nice: running a niced guest The "intr" line gives counts of interrupts serviced since boot time, for each of the possible system interrupts. The first column is the total of all @@ -1088,8 +1139,8 @@ The "processes" line gives the number of processes and threads created, which includes (but is not limited to) those created by calls to the fork() and clone() system calls. -The "procs_running" line gives the number of processes currently running on -CPUs. +The "procs_running" line gives the total number of threads that are +running or ready to run (i.e., the total number of runnable threads). The "procs_blocked" line gives the number of processes currently blocked, waiting for I/O to complete. @@ -1408,3 +1459,11 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt + +3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm +-------------------------------------------------------- +These files provide a method to access a tasks comm value. It also allows for +a task to set its own or one of its thread siblings comm value. The comm value +is limited in size compared to the cmdline value, so writing anything longer +then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated +comm value. diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt index 0d15ebccf5b0..a1e2e0dda907 100644 --- a/Documentation/filesystems/seq_file.txt +++ b/Documentation/filesystems/seq_file.txt @@ -248,9 +248,7 @@ code, that is done in the initialization code in the usual way: { struct proc_dir_entry *entry; - entry = create_proc_entry("sequence", 0, NULL); - if (entry) - entry->proc_fops = &ct_file_ops; + proc_create("sequence", 0, NULL, &ct_file_ops); return 0; } diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt index 23a181074f94..fc0e39af43c3 100644 --- a/Documentation/filesystems/sharedsubtree.txt +++ b/Documentation/filesystems/sharedsubtree.txt @@ -837,6 +837,9 @@ replicas continue to be exactly same. individual lists does not affect propagation or the way propagation tree is modified by operations. + All vfsmounts in a peer group have the same ->mnt_master. If it is + non-NULL, they form a contiguous (ordered) segment of slave list. + A example propagation tree looks as shown in the figure below. [ NOTE: Though it looks like a forest, if we consider all the shared mounts as a conceptual entity called 'pnode', it becomes a tree] @@ -874,8 +877,19 @@ replicas continue to be exactly same. NOTE: The propagation tree is orthogonal to the mount tree. +8B Locking: + + ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected + by namespace_sem (exclusive for modifications, shared for reading). + + Normally we have ->mnt_flags modifications serialized by vfsmount_lock. + There are two exceptions: do_add_mount() and clone_mnt(). + The former modifies a vfsmount that has not been visible in any shared + data structures yet. + The latter holds namespace_sem and the only references to vfsmount + are in lists that can't be traversed without namespace_sem. -8B Algorithm: +8C Algorithm: The crux of the implementation resides in rbind/move operation. diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index b245d524d568..931c806642c5 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -91,8 +91,8 @@ struct device_attribute { const char *buf, size_t count); }; -int device_create_file(struct device *, struct device_attribute *); -void device_remove_file(struct device *, struct device_attribute *); +int device_create_file(struct device *, const struct device_attribute *); +void device_remove_file(struct device *, const struct device_attribute *); It also defines this helper for defining device attributes: @@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store); Creation/Removal: -int device_create_file(struct device *device, struct device_attribute * attr); -void device_remove_file(struct device * dev, struct device_attribute * attr); +int device_create_file(struct device *dev, const struct device_attribute * attr); +void device_remove_file(struct device *dev, const struct device_attribute * attr); - bus drivers (include/linux/device.h) @@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store) Creation/Removal: -int driver_create_file(struct device_driver *, struct driver_attribute *); -void driver_remove_file(struct device_driver *, struct driver_attribute *); +int driver_create_file(struct device_driver *, const struct driver_attribute *); +void driver_remove_file(struct device_driver *, const struct driver_attribute *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 623f094c9d8d..3de2f32edd90 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -472,7 +472,7 @@ __sync_single_inode) to check if ->writepages has been successful in writing out the whole address_space. The Writeback tag is used by filemap*wait* and sync_page* functions, -via wait_on_page_writeback_range, to wait for all writeback to +via filemap_fdatawait_range, to wait for all writeback to complete. While waiting ->sync_page (if defined) will be called on each page that is found to require writeback. |
