btree.c 285 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522
  1. /*
  2. ** 2004 April 6
  3. **
  4. ** The author disclaims copyright to this source code. In place of
  5. ** a legal notice, here is a blessing:
  6. **
  7. ** May you do good and not evil.
  8. ** May you find forgiveness for yourself and forgive others.
  9. ** May you share freely, never taking more than you give.
  10. **
  11. *************************************************************************
  12. ** This file implements an external (disk-based) database using BTrees.
  13. ** See the header comment on "btreeInt.h" for additional information.
  14. ** Including a description of file format and an overview of operation.
  15. */
  16. #include "btreeInt.h"
  17. /*
  18. ** The header string that appears at the beginning of every
  19. ** SQLite database.
  20. */
  21. static const char zMagicHeader[] = SQLITE_FILE_HEADER;
  22. /*
  23. ** Set this global variable to 1 to enable tracing using the TRACE
  24. ** macro.
  25. */
  #if 0
  int sqlite3BtreeTrace=1;  /* True to enable tracing */
  /* X is a fully parenthesized printf() argument list, e.g.
  ** TRACE(("fmt %d\n", v));  The expansion is wrapped in do{}while(0)
  ** so that it is exactly one statement and cannot capture an "else"
  ** that follows the TRACE() invocation. */
  # define TRACE(X)  do{ if(sqlite3BtreeTrace){ printf X; fflush(stdout); } }while(0)
  #else
  /* Tracing compiled out: TRACE() expands to nothing. */
  # define TRACE(X)
  #endif
  32. /*
  33. ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
  34. ** But if the value is zero, make it 65536.
  35. **
  36. ** This routine is used to extract the "offset to cell content area" value
  37. ** from the header of a btree page. If the page size is 65536 and the page
  38. ** is empty, the offset should be 65536, but the 2-byte value stores zero.
  39. ** This routine makes the necessary adjustment to 65536.
  40. */
  /* Map the stored 16-bit value v to ((v-1) mod 65536)+1, so that a stored
  ** zero reads back as 65536 and every other value is unchanged.  X is
  ** evaluated exactly once. */
  #define get2byteNotZero(X)  (1 + (0xffff & ((int)get2byte(X) - 1)))
  42. /*
  43. ** Values passed as the 5th argument to allocateBtreePage()
  44. */
  #define BTALLOC_ANY    0   /* No constraint: allocate any page */
  #define BTALLOC_EXACT  1   /* Allocate the exact page requested, if possible */
  #define BTALLOC_LE     2   /* Allocate any page <= the parameter */
  48. /*
  49. ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
  50. ** defined, or 0 if it is. For example:
  51. **
  52. ** bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
  53. */
  #ifdef SQLITE_OMIT_AUTOVACUUM
  /* Auto-vacuum compiled out: the expression is discarded, result is 0. */
  # define IfNotOmitAV(expr) 0
  #else
  /* Auto-vacuum available: the expression is passed through unchanged. */
  # define IfNotOmitAV(expr) (expr)
  #endif
  59. #ifndef SQLITE_OMIT_SHARED_CACHE
  60. /*
  61. ** A list of BtShared objects that are eligible for participation
  62. ** in shared cache. This variable has file scope during normal builds,
  63. ** but the test harness needs to access it so we make it global for
  64. ** test builds.
  65. **
  66. ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
  67. */
  68. #ifdef SQLITE_TEST
  69. BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
  70. #else
  71. static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
  72. #endif
  73. #endif /* SQLITE_OMIT_SHARED_CACHE */
  74. #ifndef SQLITE_OMIT_SHARED_CACHE
  75. /*
  76. ** Enable or disable the shared pager and schema features.
  77. **
  78. ** This routine has no effect on existing database connections.
  79. ** The shared cache setting affects only future calls to
  80. ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
  81. */
  82. int sqlite3_enable_shared_cache(int enable){
  83. sqlite3GlobalConfig.sharedCacheEnabled = enable;
  84. return SQLITE_OK;
  85. }
  86. #endif
  87. #ifdef SQLITE_OMIT_SHARED_CACHE
  88. /*
  89. ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  90. ** and clearAllSharedCacheTableLocks()
  91. ** manipulate entries in the BtShared.pLock linked list used to store
  92. ** shared-cache table level locks. If the library is compiled with the
  93. ** shared-cache feature disabled, then there is only ever one user
  94. ** of each BtShared structure and so this locking is not necessary.
  95. ** So define the lock related functions as no-ops.
  96. */
  /* Lock acquisition always "succeeds" when shared cache is compiled out. */
  #define querySharedCacheTableLock(a,b,c)      SQLITE_OK
  #define setSharedCacheTableLock(a,b,c)        SQLITE_OK
  /* Releasing or downgrading locks expands to nothing. */
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  /* assert() helpers: the lock is always held, there is never a conflict. */
  #define hasSharedCacheTableLock(a,b,c,d)      1
  #define hasReadConflicts(a, b)                0
  103. #endif
  104. #ifndef SQLITE_OMIT_SHARED_CACHE
  105. #ifdef SQLITE_DEBUG
  106. /*
  107. **** This function is only used as part of an assert() statement. ***
  108. **
  109. ** Check to see if pBtree holds the required locks to read or write to the
  110. ** table with root page iRoot. Return 1 if it does and 0 if not.
  111. **
  112. ** For example, when writing to a table with root-page iRoot via
  113. ** Btree connection pBtree:
  114. **
  115. ** assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
  116. **
  117. ** When writing to an index that resides in a sharable database, the
  118. ** caller should have first obtained a lock specifying the root page of
  119. ** the corresponding table. This makes things a bit more complicated,
  120. ** as this module treats each table as a separate structure. To determine
  121. ** the table corresponding to the index being written, this
  122. ** function has to search through the database schema.
  123. **
  124. ** Instead of a lock on the table/index rooted at page iRoot, the caller may
  125. ** hold a write-lock on the schema table (root page 1). This is also
  126. ** acceptable.
  127. */
  128. static int hasSharedCacheTableLock(
  129. Btree *pBtree, /* Handle that must hold lock */
  130. Pgno iRoot, /* Root page of b-tree */
  131. int isIndex, /* True if iRoot is the root of an index b-tree */
  132. int eLockType /* Required lock type (READ_LOCK or WRITE_LOCK) */
  133. ){
  134. Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  135. Pgno iTab = 0;
  136. BtLock *pLock;
  137. /* If this database is not shareable, or if the client is reading
  138. ** and has the read-uncommitted flag set, then no lock is required.
  139. ** Return true immediately.
  140. */
  141. if( (pBtree->sharable==0)
  142. || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
  143. ){
  144. return 1;
  145. }
  146. /* If the client is reading or writing an index and the schema is
  147. ** not loaded, then it is too difficult to actually check to see if
  148. ** the correct locks are held. So do not bother - just return true.
  149. ** This case does not come up very often anyhow.
  150. */
  151. if( isIndex && (!pSchema || (pSchema->flags&DB_SchemaLoaded)==0) ){
  152. return 1;
  153. }
  154. /* Figure out the root-page that the lock should be held on. For table
  155. ** b-trees, this is just the root page of the b-tree being read or
  156. ** written. For index b-trees, it is the root page of the associated
  157. ** table. */
  158. if( isIndex ){
  159. HashElem *p;
  160. for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
  161. Index *pIdx = (Index *)sqliteHashData(p);
  162. if( pIdx->tnum==(int)iRoot ){
  163. iTab = pIdx->pTable->tnum;
  164. }
  165. }
  166. }else{
  167. iTab = iRoot;
  168. }
  169. /* Search for the required lock. Either a write-lock on root-page iTab, a
  170. ** write-lock on the schema table, or (if the client is reading) a
  171. ** read-lock on iTab will suffice. Return 1 if any of these are found. */
  172. for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
  173. if( pLock->pBtree==pBtree
  174. && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
  175. && pLock->eLock>=eLockType
  176. ){
  177. return 1;
  178. }
  179. }
  180. /* Failed to find the required lock. */
  181. return 0;
  182. }
  183. #endif /* SQLITE_DEBUG */
  184. #ifdef SQLITE_DEBUG
  185. /*
  186. **** This function may be used as part of assert() statements only. ****
  187. **
  188. ** Return true if it would be illegal for pBtree to write into the
  189. ** table or index rooted at iRoot because other shared connections are
  190. ** simultaneously reading that same table or index.
  191. **
  192. ** It is illegal for pBtree to write if some other Btree object that
  193. ** shares the same BtShared object is currently reading or writing
  194. ** the iRoot table. Except, if the other Btree object has the
  195. ** read-uncommitted flag set, then it is OK for the other object to
  196. ** have a read cursor.
  197. **
  198. ** For example, before writing to any part of the table or index
  199. ** rooted at page iRoot, one should call:
  200. **
  201. ** assert( !hasReadConflicts(pBtree, iRoot) );
  202. */
  203. static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  204. BtCursor *p;
  205. for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  206. if( p->pgnoRoot==iRoot
  207. && p->pBtree!=pBtree
  208. && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
  209. ){
  210. return 1;
  211. }
  212. }
  213. return 0;
  214. }
  215. #endif /* #ifdef SQLITE_DEBUG */
  216. /*
  217. ** Query to see if Btree handle p may obtain a lock of type eLock
  218. ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
  219. ** SQLITE_OK if the lock may be obtained (by calling
  220. ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
  221. */
  222. static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
  223. BtShared *pBt = p->pBt;
  224. BtLock *pIter;
  225. assert( sqlite3BtreeHoldsMutex(p) );
  226. assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
  227. assert( p->db!=0 );
  228. assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
  229. /* If requesting a write-lock, then the Btree must have an open write
  230. ** transaction on this file. And, obviously, for this to be so there
  231. ** must be an open write transaction on the file itself.
  232. */
  233. assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
  234. assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
  235. /* This routine is a no-op if the shared-cache is not enabled */
  236. if( !p->sharable ){
  237. return SQLITE_OK;
  238. }
  239. /* If some other connection is holding an exclusive lock, the
  240. ** requested lock may not be obtained.
  241. */
  242. if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
  243. sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
  244. return SQLITE_LOCKED_SHAREDCACHE;
  245. }
  246. for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  247. /* The condition (pIter->eLock!=eLock) in the following if(...)
  248. ** statement is a simplification of:
  249. **
  250. ** (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
  251. **
  252. ** since we know that if eLock==WRITE_LOCK, then no other connection
  253. ** may hold a WRITE_LOCK on any table in this file (since there can
  254. ** only be a single writer).
  255. */
  256. assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
  257. assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
  258. if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
  259. sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
  260. if( eLock==WRITE_LOCK ){
  261. assert( p==pBt->pWriter );
  262. pBt->btsFlags |= BTS_PENDING;
  263. }
  264. return SQLITE_LOCKED_SHAREDCACHE;
  265. }
  266. }
  267. return SQLITE_OK;
  268. }
  269. #endif /* !SQLITE_OMIT_SHARED_CACHE */
  270. #ifndef SQLITE_OMIT_SHARED_CACHE
  271. /*
  272. ** Add a lock on the table with root-page iTable to the shared-btree used
  273. ** by Btree handle p. Parameter eLock must be either READ_LOCK or
  274. ** WRITE_LOCK.
  275. **
  276. ** This function assumes the following:
  277. **
  278. ** (a) The specified Btree object p is connected to a sharable
  279. ** database (one with the BtShared.sharable flag set), and
  280. **
  281. ** (b) No other Btree objects hold a lock that conflicts
  282. ** with the requested lock (i.e. querySharedCacheTableLock() has
  283. ** already been called and returned SQLITE_OK).
  284. **
  285. ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
  286. ** is returned if a malloc attempt fails.
  287. */
  288. static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
  289. BtShared *pBt = p->pBt;
  290. BtLock *pLock = 0;
  291. BtLock *pIter;
  292. assert( sqlite3BtreeHoldsMutex(p) );
  293. assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
  294. assert( p->db!=0 );
  295. /* A connection with the read-uncommitted flag set will never try to
  296. ** obtain a read-lock using this function. The only read-lock obtained
  297. ** by a connection in read-uncommitted mode is on the sqlite_master
  298. ** table, and that lock is obtained in BtreeBeginTrans(). */
  299. assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
  300. /* This function should only be called on a sharable b-tree after it
  301. ** has been determined that no other b-tree holds a conflicting lock. */
  302. assert( p->sharable );
  303. assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
  304. /* First search the list for an existing lock on this table. */
  305. for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  306. if( pIter->iTable==iTable && pIter->pBtree==p ){
  307. pLock = pIter;
  308. break;
  309. }
  310. }
  311. /* If the above search did not find a BtLock struct associating Btree p
  312. ** with table iTable, allocate one and link it into the list.
  313. */
  314. if( !pLock ){
  315. pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
  316. if( !pLock ){
  317. return SQLITE_NOMEM;
  318. }
  319. pLock->iTable = iTable;
  320. pLock->pBtree = p;
  321. pLock->pNext = pBt->pLock;
  322. pBt->pLock = pLock;
  323. }
  324. /* Set the BtLock.eLock variable to the maximum of the current lock
  325. ** and the requested lock. This means if a write-lock was already held
  326. ** and a read-lock requested, we don't incorrectly downgrade the lock.
  327. */
  328. assert( WRITE_LOCK>READ_LOCK );
  329. if( eLock>pLock->eLock ){
  330. pLock->eLock = eLock;
  331. }
  332. return SQLITE_OK;
  333. }
  334. #endif /* !SQLITE_OMIT_SHARED_CACHE */
  335. #ifndef SQLITE_OMIT_SHARED_CACHE
  336. /*
  337. ** Release all the table locks (locks obtained via calls to
  338. ** the setSharedCacheTableLock() procedure) held by Btree object p.
  339. **
  340. ** This function assumes that Btree p has an open read or write
  341. ** transaction. If it does not, then the BTS_PENDING flag
  342. ** may be incorrectly cleared.
  343. */
  344. static void clearAllSharedCacheTableLocks(Btree *p){
  345. BtShared *pBt = p->pBt;
  346. BtLock **ppIter = &pBt->pLock;
  347. assert( sqlite3BtreeHoldsMutex(p) );
  348. assert( p->sharable || 0==*ppIter );
  349. assert( p->inTrans>0 );
  350. while( *ppIter ){
  351. BtLock *pLock = *ppIter;
  352. assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
  353. assert( pLock->pBtree->inTrans>=pLock->eLock );
  354. if( pLock->pBtree==p ){
  355. *ppIter = pLock->pNext;
  356. assert( pLock->iTable!=1 || pLock==&p->lock );
  357. if( pLock->iTable!=1 ){
  358. sqlite3_free(pLock);
  359. }
  360. }else{
  361. ppIter = &pLock->pNext;
  362. }
  363. }
  364. assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
  365. if( pBt->pWriter==p ){
  366. pBt->pWriter = 0;
  367. pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
  368. }else if( pBt->nTransaction==2 ){
  369. /* This function is called when Btree p is concluding its
  370. ** transaction. If there currently exists a writer, and p is not
  371. ** that writer, then the number of locks held by connections other
  372. ** than the writer must be about to drop to zero. In this case
  373. ** set the BTS_PENDING flag to 0.
  374. **
  375. ** If there is not currently a writer, then BTS_PENDING must
  376. ** be zero already. So this next line is harmless in that case.
  377. */
  378. pBt->btsFlags &= ~BTS_PENDING;
  379. }
  380. }
  381. /*
  382. ** This function changes all write-locks held by Btree p into read-locks.
  383. */
  384. static void downgradeAllSharedCacheTableLocks(Btree *p){
  385. BtShared *pBt = p->pBt;
  386. if( pBt->pWriter==p ){
  387. BtLock *pLock;
  388. pBt->pWriter = 0;
  389. pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
  390. for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
  391. assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
  392. pLock->eLock = READ_LOCK;
  393. }
  394. }
  395. }
  396. #endif /* SQLITE_OMIT_SHARED_CACHE */
  397. static void releasePage(MemPage *pPage); /* Forward reference */
  398. /*
  399. ***** This routine is used inside of assert() only ****
  400. **
  401. ** Verify that the cursor holds the mutex on its BtShared
  402. */
  403. #ifdef SQLITE_DEBUG
  404. static int cursorHoldsMutex(BtCursor *p){
  405. return sqlite3_mutex_held(p->pBt->mutex);
  406. }
  407. #endif
  408. #ifndef SQLITE_OMIT_INCRBLOB
  409. /*
  410. ** Invalidate the overflow page-list cache for cursor pCur, if any.
  411. */
  412. static void invalidateOverflowCache(BtCursor *pCur){
  413. assert( cursorHoldsMutex(pCur) );
  414. sqlite3_free(pCur->aOverflow);
  415. pCur->aOverflow = 0;
  416. }
  417. /*
  418. ** Invalidate the overflow page-list cache for all cursors opened
  419. ** on the shared btree structure pBt.
  420. */
  421. static void invalidateAllOverflowCache(BtShared *pBt){
  422. BtCursor *p;
  423. assert( sqlite3_mutex_held(pBt->mutex) );
  424. for(p=pBt->pCursor; p; p=p->pNext){
  425. invalidateOverflowCache(p);
  426. }
  427. }
  428. /*
  429. ** This function is called before modifying the contents of a table
  430. ** to invalidate any incrblob cursors that are open on the
  431. ** row or one of the rows being modified.
  432. **
  433. ** If argument isClearTable is true, then the entire contents of the
  434. ** table is about to be deleted. In this case invalidate all incrblob
  435. ** cursors open on any row within the table with root-page pgnoRoot.
  436. **
  437. ** Otherwise, if argument isClearTable is false, then the row with
  438. ** rowid iRow is being replaced or deleted. In this case invalidate
  439. ** only those incrblob cursors open on that specific row.
  440. */
  441. static void invalidateIncrblobCursors(
  442. Btree *pBtree, /* The database file to check */
  443. i64 iRow, /* The rowid that might be changing */
  444. int isClearTable /* True if all rows are being deleted */
  445. ){
  446. BtCursor *p;
  447. BtShared *pBt = pBtree->pBt;
  448. assert( sqlite3BtreeHoldsMutex(pBtree) );
  449. for(p=pBt->pCursor; p; p=p->pNext){
  450. if( p->isIncrblobHandle && (isClearTable || p->info.nKey==iRow) ){
  451. p->eState = CURSOR_INVALID;
  452. }
  453. }
  454. }
  455. #else
  456. /* Stub functions when INCRBLOB is omitted */
  /* Each of these expands to nothing, so call sites compile to an
  ** empty statement. */
  #define invalidateOverflowCache(x)
  #define invalidateAllOverflowCache(x)
  #define invalidateIncrblobCursors(x,y,z)
  460. #endif /* SQLITE_OMIT_INCRBLOB */
  461. /*
  462. ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
  463. ** when a page that previously contained data becomes a free-list leaf
  464. ** page.
  465. **
  466. ** The BtShared.pHasContent bitvec exists to work around an obscure
  467. ** bug caused by the interaction of two useful IO optimizations surrounding
  468. ** free-list leaf pages:
  469. **
  470. ** 1) When all data is deleted from a page and the page becomes
  471. ** a free-list leaf page, the page is not written to the database
  472. ** (as free-list leaf pages contain no meaningful data). Sometimes
  473. ** such a page is not even journalled (as it will not be modified,
  474. ** why bother journalling it?).
  475. **
  476. ** 2) When a free-list leaf page is reused, its content is not read
  477. ** from the database or written to the journal file (why should it
  478. ** be, if it is not at all meaningful?).
  479. **
  480. ** By themselves, these optimizations work fine and provide a handy
  481. ** performance boost to bulk delete or insert operations. However, if
  482. ** a page is moved to the free-list and then reused within the same
  483. ** transaction, a problem comes up. If the page is not journalled when
  484. ** it is moved to the free-list and it is also not journalled when it
  485. ** is extracted from the free-list and reused, then the original data
  486. ** may be lost. In the event of a rollback, it may not be possible
  487. ** to restore the database to its original configuration.
  488. **
  489. ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
  490. ** moved to become a free-list leaf page, the corresponding bit is
  491. ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
  492. ** optimization 2 above is omitted if the corresponding bit is already
  493. ** set in BtShared.pHasContent. The contents of the bitvec are cleared
  494. ** at the end of every transaction.
  495. */
  496. static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
  497. int rc = SQLITE_OK;
  498. if( !pBt->pHasContent ){
  499. assert( pgno<=pBt->nPage );
  500. pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
  501. if( !pBt->pHasContent ){
  502. rc = SQLITE_NOMEM;
  503. }
  504. }
  505. if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
  506. rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
  507. }
  508. return rc;
  509. }
  510. /*
  511. ** Query the BtShared.pHasContent vector.
  512. **
  513. ** This function is called when a free-list leaf page is removed from the
  514. ** free-list for reuse. It returns false if it is safe to retrieve the
  515. ** page from the pager layer with the 'no-content' flag set. True otherwise.
  516. */
  517. static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
  518. Bitvec *p = pBt->pHasContent;
  519. return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
  520. }
  521. /*
  522. ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
  523. ** invoked at the conclusion of each write-transaction.
  524. */
  525. static void btreeClearHasContent(BtShared *pBt){
  526. sqlite3BitvecDestroy(pBt->pHasContent);
  527. pBt->pHasContent = 0;
  528. }
  529. /*
  530. ** Release all of the apPage[] pages for a cursor.
  531. */
  532. static void btreeReleaseAllCursorPages(BtCursor *pCur){
  533. int i;
  534. for(i=0; i<=pCur->iPage; i++){
  535. releasePage(pCur->apPage[i]);
  536. pCur->apPage[i] = 0;
  537. }
  538. pCur->iPage = -1;
  539. }
  540. /*
  541. ** Save the current cursor position in the variables BtCursor.nKey
  542. ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
  543. **
  544. ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
  545. ** prior to calling this routine.
  546. */
  547. static int saveCursorPosition(BtCursor *pCur){
  548. int rc;
  549. assert( CURSOR_VALID==pCur->eState );
  550. assert( 0==pCur->pKey );
  551. assert( cursorHoldsMutex(pCur) );
  552. rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
  553. assert( rc==SQLITE_OK ); /* KeySize() cannot fail */
  554. /* If this is an intKey table, then the above call to BtreeKeySize()
  555. ** stores the integer key in pCur->nKey. In this case this value is
  556. ** all that is required. Otherwise, if pCur is not open on an intKey
  557. ** table, then malloc space for and store the pCur->nKey bytes of key
  558. ** data.
  559. */
  560. if( 0==pCur->apPage[0]->intKey ){
  561. void *pKey = sqlite3Malloc( (int)pCur->nKey );
  562. if( pKey ){
  563. rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
  564. if( rc==SQLITE_OK ){
  565. pCur->pKey = pKey;
  566. }else{
  567. sqlite3_free(pKey);
  568. }
  569. }else{
  570. rc = SQLITE_NOMEM;
  571. }
  572. }
  573. assert( !pCur->apPage[0]->intKey || !pCur->pKey );
  574. if( rc==SQLITE_OK ){
  575. btreeReleaseAllCursorPages(pCur);
  576. pCur->eState = CURSOR_REQUIRESEEK;
  577. }
  578. invalidateOverflowCache(pCur);
  579. return rc;
  580. }
  581. /*
  582. ** Save the positions of all cursors (except pExcept) that are open on
  583. ** the table with root-page iRoot. Usually, this is called just before cursor
  584. ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
  585. */
  586. static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
  587. BtCursor *p;
  588. assert( sqlite3_mutex_held(pBt->mutex) );
  589. assert( pExcept==0 || pExcept->pBt==pBt );
  590. for(p=pBt->pCursor; p; p=p->pNext){
  591. if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
  592. if( p->eState==CURSOR_VALID ){
  593. int rc = saveCursorPosition(p);
  594. if( SQLITE_OK!=rc ){
  595. return rc;
  596. }
  597. }else{
  598. testcase( p->iPage>0 );
  599. btreeReleaseAllCursorPages(p);
  600. }
  601. }
  602. }
  603. return SQLITE_OK;
  604. }
  605. /*
  606. ** Clear the current cursor position.
  607. */
  608. void sqlite3BtreeClearCursor(BtCursor *pCur){
  609. assert( cursorHoldsMutex(pCur) );
  610. sqlite3_free(pCur->pKey);
  611. pCur->pKey = 0;
  612. pCur->eState = CURSOR_INVALID;
  613. }
  614. /*
  615. ** In this version of BtreeMoveto, pKey is a packed index record
  616. ** such as is generated by the OP_MakeRecord opcode. Unpack the
  617. ** record and then call BtreeMovetoUnpacked() to do the work.
  618. */
  619. static int btreeMoveto(
  620. BtCursor *pCur, /* Cursor open on the btree to be searched */
  621. const void *pKey, /* Packed key if the btree is an index */
  622. i64 nKey, /* Integer key for tables. Size of pKey for indices */
  623. int bias, /* Bias search to the high end */
  624. int *pRes /* Write search results here */
  625. ){
  626. int rc; /* Status code */
  627. UnpackedRecord *pIdxKey; /* Unpacked index key */
  628. char aSpace[150]; /* Temp space for pIdxKey - to avoid a malloc */
  629. char *pFree = 0;
  630. if( pKey ){
  631. assert( nKey==(i64)(int)nKey );
  632. pIdxKey = sqlite3VdbeAllocUnpackedRecord(
  633. pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
  634. );
  635. if( pIdxKey==0 ) return SQLITE_NOMEM;
  636. sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
  637. }else{
  638. pIdxKey = 0;
  639. }
  640. rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
  641. if( pFree ){
  642. sqlite3DbFree(pCur->pKeyInfo->db, pFree);
  643. }
  644. return rc;
  645. }
  646. /*
  647. ** Restore the cursor to the position it was in (or as close to as possible)
  648. ** when saveCursorPosition() was called. Note that this call deletes the
  649. ** saved position info stored by saveCursorPosition(), so there can be
  650. ** at most one effective restoreCursorPosition() call after each
  651. ** saveCursorPosition().
  652. */
  653. static int btreeRestoreCursorPosition(BtCursor *pCur){
  654. int rc;
  655. assert( cursorHoldsMutex(pCur) );
  656. assert( pCur->eState>=CURSOR_REQUIRESEEK );
  657. if( pCur->eState==CURSOR_FAULT ){
  658. return pCur->skipNext;
  659. }
  660. pCur->eState = CURSOR_INVALID;
  661. rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
  662. if( rc==SQLITE_OK ){
  663. sqlite3_free(pCur->pKey);
  664. pCur->pKey = 0;
  665. assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
  666. if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
  667. pCur->eState = CURSOR_SKIPNEXT;
  668. }
  669. }
  670. return rc;
  671. }
  /* Restore cursor p only if its state is CURSOR_REQUIRESEEK or greater;
  ** otherwise evaluate to SQLITE_OK without calling anything.  The argument
  ** is parenthesized so that expressions such as &cursor expand correctly.
  ** Note that p may be evaluated twice. */
  #define restoreCursorPosition(p) \
    ((p)->eState>=CURSOR_REQUIRESEEK ? \
           btreeRestoreCursorPosition(p) : \
           SQLITE_OK)
  676. /*
  677. ** Determine whether or not a cursor has moved from the position it
  678. ** was last placed at. Cursors can move when the row they are pointing
  679. ** at is deleted out from under them.
  680. **
  681. ** This routine returns an error code if something goes wrong. The
  682. ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
  683. */
  684. int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
  685. int rc;
  686. rc = restoreCursorPosition(pCur);
  687. if( rc ){
  688. *pHasMoved = 1;
  689. return rc;
  690. }
  691. if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){
  692. *pHasMoved = 1;
  693. }else{
  694. *pHasMoved = 0;
  695. }
  696. return SQLITE_OK;
  697. }
  698. #ifndef SQLITE_OMIT_AUTOVACUUM
  699. /*
  700. ** Given a page number of a regular database page, return the page
  701. ** number for the pointer-map page that contains the entry for the
  702. ** input page number.
  703. **
  704. ** Return 0 (not a valid page) for pgno==1 since there is
  705. ** no pointer map associated with page 1. The integrity_check logic
  706. ** requires that ptrmapPageno(*,1)!=1.
  707. */
  708. static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
  709. int nPagesPerMapPage;
  710. Pgno iPtrMap, ret;
  711. assert( sqlite3_mutex_held(pBt->mutex) );
  712. if( pgno<2 ) return 0;
  713. nPagesPerMapPage = (pBt->usableSize/5)+1;
  714. iPtrMap = (pgno-2)/nPagesPerMapPage;
  715. ret = (iPtrMap*nPagesPerMapPage) + 2;
  716. if( ret==PENDING_BYTE_PAGE(pBt) ){
  717. ret++;
  718. }
  719. return ret;
  720. }
  721. /*
  722. ** Write an entry into the pointer map.
  723. **
  724. ** This routine updates the pointer map entry for page number 'key'
  725. ** so that it maps to type 'eType' and parent page number 'pgno'.
  726. **
  727. ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
  728. ** a no-op. If an error occurs, the appropriate error code is written
  729. ** into *pRC.
  730. */
  731. static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
  732. DbPage *pDbPage; /* The pointer map page */
  733. u8 *pPtrmap; /* The pointer map data */
  734. Pgno iPtrmap; /* The pointer map page number */
  735. int offset; /* Offset in pointer map page */
  736. int rc; /* Return code from subfunctions */
  737. if( *pRC ) return;
  738. assert( sqlite3_mutex_held(pBt->mutex) );
  739. /* The master-journal page number must never be used as a pointer map page */
  740. assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
  741. assert( pBt->autoVacuum );
  742. if( key==0 ){
  743. *pRC = SQLITE_CORRUPT_BKPT;
  744. return;
  745. }
  746. iPtrmap = PTRMAP_PAGENO(pBt, key);
  747. rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
  748. if( rc!=SQLITE_OK ){
  749. *pRC = rc;
  750. return;
  751. }
  752. offset = PTRMAP_PTROFFSET(iPtrmap, key);
  753. if( offset<0 ){
  754. *pRC = SQLITE_CORRUPT_BKPT;
  755. goto ptrmap_exit;
  756. }
  757. assert( offset <= (int)pBt->usableSize-5 );
  758. pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
  759. if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
  760. TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
  761. *pRC= rc = sqlite3PagerWrite(pDbPage);
  762. if( rc==SQLITE_OK ){
  763. pPtrmap[offset] = eType;
  764. put4byte(&pPtrmap[offset+1], parent);
  765. }
  766. }
  767. ptrmap_exit:
  768. sqlite3PagerUnref(pDbPage);
  769. }
  770. /*
  771. ** Read an entry from the pointer map.
  772. **
  773. ** This routine retrieves the pointer map entry for page 'key', writing
  774. ** the type and parent page number to *pEType and *pPgno respectively.
  775. ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
  776. */
  777. static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
  778. DbPage *pDbPage; /* The pointer map page */
  779. int iPtrmap; /* Pointer map page index */
  780. u8 *pPtrmap; /* Pointer map page data */
  781. int offset; /* Offset of entry in pointer map */
  782. int rc;
  783. assert( sqlite3_mutex_held(pBt->mutex) );
  784. iPtrmap = PTRMAP_PAGENO(pBt, key);
  785. rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
  786. if( rc!=0 ){
  787. return rc;
  788. }
  789. pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
  790. offset = PTRMAP_PTROFFSET(iPtrmap, key);
  791. if( offset<0 ){
  792. sqlite3PagerUnref(pDbPage);
  793. return SQLITE_CORRUPT_BKPT;
  794. }
  795. assert( offset <= (int)pBt->usableSize-5 );
  796. assert( pEType!=0 );
  797. *pEType = pPtrmap[offset];
  798. if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
  799. sqlite3PagerUnref(pDbPage);
  800. if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
  801. return SQLITE_OK;
  802. }
  803. #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Auto-vacuum is compiled out: the pointer-map routines collapse to
  ** nothing, and a pointer-map lookup always reports SQLITE_OK. */
  #define ptrmapPut(pBt,key,eType,parent,pRC)
  #define ptrmapGet(pBt,key,pEType,pPgno) SQLITE_OK
  #define ptrmapPutOvflPtr(pPage,pCell,pRC)
  807. #endif
  /*
  ** Given a btree page and a cell index (0 means the first cell on
  ** the page, 1 means the second cell, and so forth) return a pointer
  ** to the cell content.
  **
  ** findCell(P,I) reads the I-th two-byte entry of P->aCellIdx, masks it
  ** with P->maskPage and adds the result to P->aData.
  **
  ** findCellv2(D,M,O,I) performs the same computation from raw values:
  ** D is the page data pointer, M the offset mask, O the byte offset of
  ** the cell pointer array within D, and I the cell index.  Every macro
  ** parameter is parenthesized so that compound argument expressions
  ** (for example "a|b" as the mask) expand with the intended precedence.
  **
  ** These macros work only for pages that do not contain overflow cells.
  */
  #define findCell(P,I) \
    ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)])))
  #define findCellv2(D,M,O,I) \
    ((D) + ((M) & get2byte((D) + ((O) + 2*(I)))))
  818. /*
  819. ** This a more complex version of findCell() that works for
  820. ** pages that do contain overflow cells.
  821. */
  822. static u8 *findOverflowCell(MemPage *pPage, int iCell){
  823. int i;
  824. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  825. for(i=pPage->nOverflow-1; i>=0; i--){
  826. int k;
  827. k = pPage->aiOvfl[i];
  828. if( k<=iCell ){
  829. if( k==iCell ){
  830. return pPage->apOvfl[i];
  831. }
  832. iCell--;
  833. }
  834. }
  835. return findCell(pPage, iCell);
  836. }
  837. /*
  838. ** Parse a cell content block and fill in the CellInfo structure. There
  839. ** are two versions of this function. btreeParseCell() takes a
  840. ** cell index as the second argument and btreeParseCellPtr()
  841. ** takes a pointer to the body of the cell as its second argument.
  842. **
  843. ** Within this file, the parseCell() macro can be called instead of
  844. ** btreeParseCellPtr(). Using some compilers, this will be faster.
  845. */
  846. static void btreeParseCellPtr(
  847. MemPage *pPage, /* Page containing the cell */
  848. u8 *pCell, /* Pointer to the cell text. */
  849. CellInfo *pInfo /* Fill in this structure */
  850. ){
  851. u16 n; /* Number bytes in cell content header */
  852. u32 nPayload; /* Number of bytes of cell payload */
  853. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  854. pInfo->pCell = pCell;
  855. assert( pPage->leaf==0 || pPage->leaf==1 );
  856. n = pPage->childPtrSize;
  857. assert( n==4-4*pPage->leaf );
  858. if( pPage->intKey ){
  859. if( pPage->hasData ){
  860. assert( n==0 );
  861. n = getVarint32(pCell, nPayload);
  862. }else{
  863. nPayload = 0;
  864. }
  865. n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
  866. pInfo->nData = nPayload;
  867. }else{
  868. pInfo->nData = 0;
  869. n += getVarint32(&pCell[n], nPayload);
  870. pInfo->nKey = nPayload;
  871. }
  872. pInfo->nPayload = nPayload;
  873. pInfo->nHeader = n;
  874. testcase( nPayload==pPage->maxLocal );
  875. testcase( nPayload==pPage->maxLocal+1 );
  876. if( likely(nPayload<=pPage->maxLocal) ){
  877. /* This is the (easy) common case where the entire payload fits
  878. ** on the local page. No overflow is required.
  879. */
  880. if( (pInfo->nSize = (u16)(n+nPayload))<4 ) pInfo->nSize = 4;
  881. pInfo->nLocal = (u16)nPayload;
  882. pInfo->iOverflow = 0;
  883. }else{
  884. /* If the payload will not fit completely on the local page, we have
  885. ** to decide how much to store locally and how much to spill onto
  886. ** overflow pages. The strategy is to minimize the amount of unused
  887. ** space on overflow pages while keeping the amount of local storage
  888. ** in between minLocal and maxLocal.
  889. **
  890. ** Warning: changing the way overflow payload is distributed in any
  891. ** way will result in an incompatible file format.
  892. */
  893. int minLocal; /* Minimum amount of payload held locally */
  894. int maxLocal; /* Maximum amount of payload held locally */
  895. int surplus; /* Overflow payload available for local storage */
  896. minLocal = pPage->minLocal;
  897. maxLocal = pPage->maxLocal;
  898. surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
  899. testcase( surplus==maxLocal );
  900. testcase( surplus==maxLocal+1 );
  901. if( surplus <= maxLocal ){
  902. pInfo->nLocal = (u16)surplus;
  903. }else{
  904. pInfo->nLocal = (u16)minLocal;
  905. }
  906. pInfo->iOverflow = (u16)(pInfo->nLocal + n);
  907. pInfo->nSize = pInfo->iOverflow + 4;
  908. }
  909. }
  910. #define parseCell(pPage, iCell, pInfo) \
  911. btreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
  912. static void btreeParseCell(
  913. MemPage *pPage, /* Page containing the cell */
  914. int iCell, /* The cell index. First cell is 0 */
  915. CellInfo *pInfo /* Fill in this structure */
  916. ){
  917. parseCell(pPage, iCell, pInfo);
  918. }
  919. /*
  920. ** Compute the total number of bytes that a Cell needs in the cell
  921. ** data area of the btree-page. The return number includes the cell
  922. ** data header and the local payload, but not any overflow page or
  923. ** the space used by the cell pointer.
  924. */
  925. static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  926. u8 *pIter = &pCell[pPage->childPtrSize];
  927. u32 nSize;
  928. #ifdef SQLITE_DEBUG
  929. /* The value returned by this function should always be the same as
  930. ** the (CellInfo.nSize) value found by doing a full parse of the
  931. ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  932. ** this function verifies that this invariant is not violated. */
  933. CellInfo debuginfo;
  934. btreeParseCellPtr(pPage, pCell, &debuginfo);
  935. #endif
  936. if( pPage->intKey ){
  937. u8 *pEnd;
  938. if( pPage->hasData ){
  939. pIter += getVarint32(pIter, nSize);
  940. }else{
  941. nSize = 0;
  942. }
  943. /* pIter now points at the 64-bit integer key value, a variable length
  944. ** integer. The following block moves pIter to point at the first byte
  945. ** past the end of the key value. */
  946. pEnd = &pIter[9];
  947. while( (*pIter++)&0x80 && pIter<pEnd );
  948. }else{
  949. pIter += getVarint32(pIter, nSize);
  950. }
  951. testcase( nSize==pPage->maxLocal );
  952. testcase( nSize==pPage->maxLocal+1 );
  953. if( nSize>pPage->maxLocal ){
  954. int minLocal = pPage->minLocal;
  955. nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
  956. testcase( nSize==pPage->maxLocal );
  957. testcase( nSize==pPage->maxLocal+1 );
  958. if( nSize>pPage->maxLocal ){
  959. nSize = minLocal;
  960. }
  961. nSize += 4;
  962. }
  963. nSize += (u32)(pIter - pCell);
  964. /* The minimum size of any cell is 4 bytes. */
  965. if( nSize<4 ){
  966. nSize = 4;
  967. }
  968. assert( nSize==debuginfo.nSize );
  969. return (u16)nSize;
  970. }
  #ifdef SQLITE_DEBUG
  /* Index-based variant of cellSizePtr(): locate cell iCell with findCell()
  ** and return its size.  Compiled only when SQLITE_DEBUG is defined, for
  ** use inside assert() statements. */
  static u16 cellSize(MemPage *pPage, int iCell){
    u8 *pCell = findCell(pPage, iCell);
    return cellSizePtr(pPage, pCell);
  }
  #endif
  978. #ifndef SQLITE_OMIT_AUTOVACUUM
  979. /*
  980. ** If the cell pCell, part of page pPage contains a pointer
  981. ** to an overflow page, insert an entry into the pointer-map
  982. ** for the overflow page.
  983. */
  984. static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
  985. CellInfo info;
  986. if( *pRC ) return;
  987. assert( pCell!=0 );
  988. btreeParseCellPtr(pPage, pCell, &info);
  989. assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
  990. if( info.iOverflow ){
  991. Pgno ovfl = get4byte(&pCell[info.iOverflow]);
  992. ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
  993. }
  994. }
  995. #endif
  996. /*
  997. ** Defragment the page given. All Cells are moved to the
  998. ** end of the page and all free space is collected into one
  999. ** big FreeBlk that occurs in between the header and cell
  1000. ** pointer array and the cell content area.
  1001. */
  1002. static int defragmentPage(MemPage *pPage){
  1003. int i; /* Loop counter */
  1004. int pc; /* Address of a i-th cell */
  1005. int hdr; /* Offset to the page header */
  1006. int size; /* Size of a cell */
  1007. int usableSize; /* Number of usable bytes on a page */
  1008. int cellOffset; /* Offset to the cell pointer array */
  1009. int cbrk; /* Offset to the cell content area */
  1010. int nCell; /* Number of cells on the page */
  1011. unsigned char *data; /* The page data */
  1012. unsigned char *temp; /* Temp area for cell content */
  1013. int iCellFirst; /* First allowable cell index */
  1014. int iCellLast; /* Last possible cell index */
  1015. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1016. assert( pPage->pBt!=0 );
  1017. assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  1018. assert( pPage->nOverflow==0 );
  1019. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1020. temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  1021. data = pPage->aData;
  1022. hdr = pPage->hdrOffset;
  1023. cellOffset = pPage->cellOffset;
  1024. nCell = pPage->nCell;
  1025. assert( nCell==get2byte(&data[hdr+3]) );
  1026. usableSize = pPage->pBt->usableSize;
  1027. cbrk = get2byte(&data[hdr+5]);
  1028. memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
  1029. cbrk = usableSize;
  1030. iCellFirst = cellOffset + 2*nCell;
  1031. iCellLast = usableSize - 4;
  1032. for(i=0; i<nCell; i++){
  1033. u8 *pAddr; /* The i-th cell pointer */
  1034. pAddr = &data[cellOffset + i*2];
  1035. pc = get2byte(pAddr);
  1036. testcase( pc==iCellFirst );
  1037. testcase( pc==iCellLast );
  1038. #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1039. /* These conditions have already been verified in btreeInitPage()
  1040. ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined
  1041. */
  1042. if( pc<iCellFirst || pc>iCellLast ){
  1043. return SQLITE_CORRUPT_BKPT;
  1044. }
  1045. #endif
  1046. assert( pc>=iCellFirst && pc<=iCellLast );
  1047. size = cellSizePtr(pPage, &temp[pc]);
  1048. cbrk -= size;
  1049. #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1050. if( cbrk<iCellFirst ){
  1051. return SQLITE_CORRUPT_BKPT;
  1052. }
  1053. #else
  1054. if( cbrk<iCellFirst || pc+size>usableSize ){
  1055. return SQLITE_CORRUPT_BKPT;
  1056. }
  1057. #endif
  1058. assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
  1059. testcase( cbrk+size==usableSize );
  1060. testcase( pc+size==usableSize );
  1061. memcpy(&data[cbrk], &temp[pc], size);
  1062. put2byte(pAddr, cbrk);
  1063. }
  1064. assert( cbrk>=iCellFirst );
  1065. put2byte(&data[hdr+5], cbrk);
  1066. data[hdr+1] = 0;
  1067. data[hdr+2] = 0;
  1068. data[hdr+7] = 0;
  1069. memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  1070. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1071. if( cbrk-iCellFirst!=pPage->nFree ){
  1072. return SQLITE_CORRUPT_BKPT;
  1073. }
  1074. return SQLITE_OK;
  1075. }
  1076. /*
  1077. ** Allocate nByte bytes of space from within the B-Tree page passed
  1078. ** as the first argument. Write into *pIdx the index into pPage->aData[]
  1079. ** of the first byte of allocated space. Return either SQLITE_OK or
  1080. ** an error code (usually SQLITE_CORRUPT).
  1081. **
  1082. ** The caller guarantees that there is sufficient space to make the
  1083. ** allocation. This routine might need to defragment in order to bring
  1084. ** all the space together, however. This routine will avoid using
  1085. ** the first two bytes past the cell pointer area since presumably this
  1086. ** allocation is being made in order to insert a new cell, so we will
  1087. ** also end up needing a new cell pointer.
  1088. */
  1089. static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  1090. const int hdr = pPage->hdrOffset; /* Local cache of pPage->hdrOffset */
  1091. u8 * const data = pPage->aData; /* Local cache of pPage->aData */
  1092. int nFrag; /* Number of fragmented bytes on pPage */
  1093. int top; /* First byte of cell content area */
  1094. int gap; /* First byte of gap between cell pointers and cell content */
  1095. int rc; /* Integer return code */
  1096. int usableSize; /* Usable size of the page */
  1097. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1098. assert( pPage->pBt );
  1099. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1100. assert( nByte>=0 ); /* Minimum cell size is 4 */
  1101. assert( pPage->nFree>=nByte );
  1102. assert( pPage->nOverflow==0 );
  1103. usableSize = pPage->pBt->usableSize;
  1104. assert( nByte < usableSize-8 );
  1105. nFrag = data[hdr+7];
  1106. assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  1107. gap = pPage->cellOffset + 2*pPage->nCell;
  1108. top = get2byteNotZero(&data[hdr+5]);
  1109. if( gap>top ) return SQLITE_CORRUPT_BKPT;
  1110. testcase( gap+2==top );
  1111. testcase( gap+1==top );
  1112. testcase( gap==top );
  1113. if( nFrag>=60 ){
  1114. /* Always defragment highly fragmented pages */
  1115. rc = defragmentPage(pPage);
  1116. if( rc ) return rc;
  1117. top = get2byteNotZero(&data[hdr+5]);
  1118. }else if( gap+2<=top ){
  1119. /* Search the freelist looking for a free slot big enough to satisfy
  1120. ** the request. The allocation is made from the first free slot in
  1121. ** the list that is large enough to accommodate it.
  1122. */
  1123. int pc, addr;
  1124. for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
  1125. int size; /* Size of the free slot */
  1126. if( pc>usableSize-4 || pc<addr+4 ){
  1127. return SQLITE_CORRUPT_BKPT;
  1128. }
  1129. size = get2byte(&data[pc+2]);
  1130. if( size>=nByte ){
  1131. int x = size - nByte;
  1132. testcase( x==4 );
  1133. testcase( x==3 );
  1134. if( x<4 ){
  1135. /* Remove the slot from the free-list. Update the number of
  1136. ** fragmented bytes within the page. */
  1137. memcpy(&data[addr], &data[pc], 2);
  1138. data[hdr+7] = (u8)(nFrag + x);
  1139. }else if( size+pc > usableSize ){
  1140. return SQLITE_CORRUPT_BKPT;
  1141. }else{
  1142. /* The slot remains on the free-list. Reduce its size to account
  1143. ** for the portion used by the new allocation. */
  1144. put2byte(&data[pc+2], x);
  1145. }
  1146. *pIdx = pc + x;
  1147. return SQLITE_OK;
  1148. }
  1149. }
  1150. }
  1151. /* Check to make sure there is enough space in the gap to satisfy
  1152. ** the allocation. If not, defragment.
  1153. */
  1154. testcase( gap+2+nByte==top );
  1155. if( gap+2+nByte>top ){
  1156. rc = defragmentPage(pPage);
  1157. if( rc ) return rc;
  1158. top = get2byteNotZero(&data[hdr+5]);
  1159. assert( gap+nByte<=top );
  1160. }
  1161. /* Allocate memory from the gap in between the cell pointer array
  1162. ** and the cell content area. The btreeInitPage() call has already
  1163. ** validated the freelist. Given that the freelist is valid, there
  1164. ** is no way that the allocation can extend off the end of the page.
  1165. ** The assert() below verifies the previous sentence.
  1166. */
  1167. top -= nByte;
  1168. put2byte(&data[hdr+5], top);
  1169. assert( top+nByte <= (int)pPage->pBt->usableSize );
  1170. *pIdx = top;
  1171. return SQLITE_OK;
  1172. }
  1173. /*
  1174. ** Return a section of the pPage->aData to the freelist.
  1175. ** The first byte of the new free block is pPage->aDisk[start]
  1176. ** and the size of the block is "size" bytes.
  1177. **
  1178. ** Most of the effort here is involved in coalesing adjacent
  1179. ** free blocks into a single big free block.
  1180. */
  1181. static int freeSpace(MemPage *pPage, int start, int size){
  1182. int addr, pbegin, hdr;
  1183. int iLast; /* Largest possible freeblock offset */
  1184. unsigned char *data = pPage->aData;
  1185. assert( pPage->pBt!=0 );
  1186. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1187. assert( start>=pPage->hdrOffset+6+pPage->childPtrSize );
  1188. assert( (start + size) <= (int)pPage->pBt->usableSize );
  1189. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1190. assert( size>=0 ); /* Minimum cell size is 4 */
  1191. if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
  1192. /* Overwrite deleted information with zeros when the secure_delete
  1193. ** option is enabled */
  1194. memset(&data[start], 0, size);
  1195. }
  1196. /* Add the space back into the linked list of freeblocks. Note that
  1197. ** even though the freeblock list was checked by btreeInitPage(),
  1198. ** btreeInitPage() did not detect overlapping cells or
  1199. ** freeblocks that overlapped cells. Nor does it detect when the
  1200. ** cell content area exceeds the value in the page header. If these
  1201. ** situations arise, then subsequent insert operations might corrupt
  1202. ** the freelist. So we do need to check for corruption while scanning
  1203. ** the freelist.
  1204. */
  1205. hdr = pPage->hdrOffset;
  1206. addr = hdr + 1;
  1207. iLast = pPage->pBt->usableSize - 4;
  1208. assert( start<=iLast );
  1209. while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
  1210. if( pbegin<addr+4 ){
  1211. return SQLITE_CORRUPT_BKPT;
  1212. }
  1213. addr = pbegin;
  1214. }
  1215. if( pbegin>iLast ){
  1216. return SQLITE_CORRUPT_BKPT;
  1217. }
  1218. assert( pbegin>addr || pbegin==0 );
  1219. put2byte(&data[addr], start);
  1220. put2byte(&data[start], pbegin);
  1221. put2byte(&data[start+2], size);
  1222. pPage->nFree = pPage->nFree + (u16)size;
  1223. /* Coalesce adjacent free blocks */
  1224. addr = hdr + 1;
  1225. while( (pbegin = get2byte(&data[addr]))>0 ){
  1226. int pnext, psize, x;
  1227. assert( pbegin>addr );
  1228. assert( pbegin <= (int)pPage->pBt->usableSize-4 );
  1229. pnext = get2byte(&data[pbegin]);
  1230. psize = get2byte(&data[pbegin+2]);
  1231. if( pbegin + psize + 3 >= pnext && pnext>0 ){
  1232. int frag = pnext - (pbegin+psize);
  1233. if( (frag<0) || (frag>(int)data[hdr+7]) ){
  1234. return SQLITE_CORRUPT_BKPT;
  1235. }
  1236. data[hdr+7] -= (u8)frag;
  1237. x = get2byte(&data[pnext]);
  1238. put2byte(&data[pbegin], x);
  1239. x = pnext + get2byte(&data[pnext+2]) - pbegin;
  1240. put2byte(&data[pbegin+2], x);
  1241. }else{
  1242. addr = pbegin;
  1243. }
  1244. }
  1245. /* If the cell content area begins with a freeblock, remove it. */
  1246. if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
  1247. int top;
  1248. pbegin = get2byte(&data[hdr+1]);
  1249. memcpy(&data[hdr+1], &data[pbegin], 2);
  1250. top = get2byte(&data[hdr+5]) + get2byte(&data[pbegin+2]);
  1251. put2byte(&data[hdr+5], top);
  1252. }
  1253. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1254. return SQLITE_OK;
  1255. }
  1256. /*
  1257. ** Decode the flags byte (the first byte of the header) for a page
  1258. ** and initialize fields of the MemPage structure accordingly.
  1259. **
  1260. ** Only the following combinations are supported. Anything different
  1261. ** indicates a corrupt database files:
  1262. **
  1263. ** PTF_ZERODATA
  1264. ** PTF_ZERODATA | PTF_LEAF
  1265. ** PTF_LEAFDATA | PTF_INTKEY
  1266. ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
  1267. */
  1268. static int decodeFlags(MemPage *pPage, int flagByte){
  1269. BtShared *pBt; /* A copy of pPage->pBt */
  1270. assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
  1271. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1272. pPage->leaf = (u8)(flagByte>>3); assert( PTF_LEAF == 1<<3 );
  1273. flagByte &= ~PTF_LEAF;
  1274. pPage->childPtrSize = 4-4*pPage->leaf;
  1275. pBt = pPage->pBt;
  1276. if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
  1277. pPage->intKey = 1;
  1278. pPage->hasData = pPage->leaf;
  1279. pPage->maxLocal = pBt->maxLeaf;
  1280. pPage->minLocal = pBt->minLeaf;
  1281. }else if( flagByte==PTF_ZERODATA ){
  1282. pPage->intKey = 0;
  1283. pPage->hasData = 0;
  1284. pPage->maxLocal = pBt->maxLocal;
  1285. pPage->minLocal = pBt->minLocal;
  1286. }else{
  1287. return SQLITE_CORRUPT_BKPT;
  1288. }
  1289. pPage->max1bytePayload = pBt->max1bytePayload;
  1290. return SQLITE_OK;
  1291. }
  1292. /*
  1293. ** Initialize the auxiliary information for a disk block.
  1294. **
  1295. ** Return SQLITE_OK on success. If we see that the page does
  1296. ** not contain a well-formed database page, then return
  1297. ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
  1298. ** guarantee that the page is well-formed. It only shows that
  1299. ** we failed to detect any corruption.
  1300. */
  1301. static int btreeInitPage(MemPage *pPage){
  1302. assert( pPage->pBt!=0 );
  1303. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1304. assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  1305. assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  1306. assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  1307. if( !pPage->isInit ){
  1308. u16 pc; /* Address of a freeblock within pPage->aData[] */
  1309. u8 hdr; /* Offset to beginning of page header */
  1310. u8 *data; /* Equal to pPage->aData */
  1311. BtShared *pBt; /* The main btree structure */
  1312. int usableSize; /* Amount of usable space on each page */
  1313. u16 cellOffset; /* Offset from start of page to first cell pointer */
  1314. int nFree; /* Number of unused bytes on the page */
  1315. int top; /* First byte of the cell content area */
  1316. int iCellFirst; /* First allowable cell or freeblock offset */
  1317. int iCellLast; /* Last possible cell or freeblock offset */
  1318. pBt = pPage->pBt;
  1319. hdr = pPage->hdrOffset;
  1320. data = pPage->aData;
  1321. if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
  1322. assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1323. pPage->maskPage = (u16)(pBt->pageSize - 1);
  1324. pPage->nOverflow = 0;
  1325. usableSize = pBt->usableSize;
  1326. pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
  1327. pPage->aDataEnd = &data[usableSize];
  1328. pPage->aCellIdx = &data[cellOffset];
  1329. top = get2byteNotZero(&data[hdr+5]);
  1330. pPage->nCell = get2byte(&data[hdr+3]);
  1331. if( pPage->nCell>MX_CELL(pBt) ){
  1332. /* To many cells for a single page. The page must be corrupt */
  1333. return SQLITE_CORRUPT_BKPT;
  1334. }
  1335. testcase( pPage->nCell==MX_CELL(pBt) );
  1336. /* A malformed database page might cause us to read past the end
  1337. ** of page when parsing a cell.
  1338. **
  1339. ** The following block of code checks early to see if a cell extends
  1340. ** past the end of a page boundary and causes SQLITE_CORRUPT to be
  1341. ** returned if it does.
  1342. */
  1343. iCellFirst = cellOffset + 2*pPage->nCell;
  1344. iCellLast = usableSize - 4;
  1345. #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1346. {
  1347. int i; /* Index into the cell pointer array */
  1348. int sz; /* Size of a cell */
  1349. if( !pPage->leaf ) iCellLast--;
  1350. for(i=0; i<pPage->nCell; i++){
  1351. pc = get2byte(&data[cellOffset+i*2]);
  1352. testcase( pc==iCellFirst );
  1353. testcase( pc==iCellLast );
  1354. if( pc<iCellFirst || pc>iCellLast ){
  1355. return SQLITE_CORRUPT_BKPT;
  1356. }
  1357. sz = cellSizePtr(pPage, &data[pc]);
  1358. testcase( pc+sz==usableSize );
  1359. if( pc+sz>usableSize ){
  1360. return SQLITE_CORRUPT_BKPT;
  1361. }
  1362. }
  1363. if( !pPage->leaf ) iCellLast++;
  1364. }
  1365. #endif
  1366. /* Compute the total free space on the page */
  1367. pc = get2byte(&data[hdr+1]);
  1368. nFree = data[hdr+7] + top;
  1369. while( pc>0 ){
  1370. u16 next, size;
  1371. if( pc<iCellFirst || pc>iCellLast ){
  1372. /* Start of free block is off the page */
  1373. return SQLITE_CORRUPT_BKPT;
  1374. }
  1375. next = get2byte(&data[pc]);
  1376. size = get2byte(&data[pc+2]);
  1377. if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
  1378. /* Free blocks must be in ascending order. And the last byte of
  1379. ** the free-block must lie on the database page. */
  1380. return SQLITE_CORRUPT_BKPT;
  1381. }
  1382. nFree = nFree + size;
  1383. pc = next;
  1384. }
  1385. /* At this point, nFree contains the sum of the offset to the start
  1386. ** of the cell-content area plus the number of free bytes within
  1387. ** the cell-content area. If this is greater than the usable-size
  1388. ** of the page, then the page must be corrupted. This check also
  1389. ** serves to verify that the offset to the start of the cell-content
  1390. ** area, according to the page header, lies within the page.
  1391. */
  1392. if( nFree>usableSize ){
  1393. return SQLITE_CORRUPT_BKPT;
  1394. }
  1395. pPage->nFree = (u16)(nFree - iCellFirst);
  1396. pPage->isInit = 1;
  1397. }
  1398. return SQLITE_OK;
  1399. }
  1400. /*
  1401. ** Set up a raw page so that it looks like a database page holding
  1402. ** no entries.
  1403. */
  1404. static void zeroPage(MemPage *pPage, int flags){
  1405. unsigned char *data = pPage->aData;
  1406. BtShared *pBt = pPage->pBt;
  1407. u8 hdr = pPage->hdrOffset;
  1408. u16 first;
  1409. assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
  1410. assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1411. assert( sqlite3PagerGetData(pPage->pDbPage) == data );
  1412. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1413. assert( sqlite3_mutex_held(pBt->mutex) );
  1414. if( pBt->btsFlags & BTS_SECURE_DELETE ){
  1415. memset(&data[hdr], 0, pBt->usableSize - hdr);
  1416. }
  1417. data[hdr] = (char)flags;
  1418. first = hdr + 8 + 4*((flags&PTF_LEAF)==0 ?1:0);
  1419. memset(&data[hdr+1], 0, 4);
  1420. data[hdr+7] = 0;
  1421. put2byte(&data[hdr+5], pBt->usableSize);
  1422. pPage->nFree = (u16)(pBt->usableSize - first);
  1423. decodeFlags(pPage, flags);
  1424. pPage->hdrOffset = hdr;
  1425. pPage->cellOffset = first;
  1426. pPage->aDataEnd = &data[pBt->usableSize];
  1427. pPage->aCellIdx = &data[first];
  1428. pPage->nOverflow = 0;
  1429. assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1430. pPage->maskPage = (u16)(pBt->pageSize - 1);
  1431. pPage->nCell = 0;
  1432. pPage->isInit = 1;
  1433. }
  1434. /*
  1435. ** Convert a DbPage obtained from the pager into a MemPage used by
  1436. ** the btree layer.
  1437. */
  1438. static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
  1439. MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  1440. pPage->aData = sqlite3PagerGetData(pDbPage);
  1441. pPage->pDbPage = pDbPage;
  1442. pPage->pBt = pBt;
  1443. pPage->pgno = pgno;
  1444. pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  1445. return pPage;
  1446. }
  1447. /*
  1448. ** Get a page from the pager. Initialize the MemPage.pBt and
  1449. ** MemPage.aData elements if needed.
  1450. **
  1451. ** If the noContent flag is set, it means that we do not care about
  1452. ** the content of the page at this time. So do not go to the disk
  1453. ** to fetch the content. Just fill in the content with zeros for now.
  1454. ** If in the future we call sqlite3PagerWrite() on this page, that
  1455. ** means we have started to be concerned about content and the disk
  1456. ** read should occur at that point.
  1457. */
  1458. static int btreeGetPage(
  1459. BtShared *pBt, /* The btree */
  1460. Pgno pgno, /* Number of the page to fetch */
  1461. MemPage **ppPage, /* Return the page in this parameter */
  1462. int flags /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
  1463. ){
  1464. int rc;
  1465. DbPage *pDbPage;
  1466. assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
  1467. assert( sqlite3_mutex_held(pBt->mutex) );
  1468. rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
  1469. if( rc ) return rc;
  1470. *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1471. return SQLITE_OK;
  1472. }
  1473. /*
  1474. ** Retrieve a page from the pager cache. If the requested page is not
  1475. ** already in the pager cache return NULL. Initialize the MemPage.pBt and
  1476. ** MemPage.aData elements if needed.
  1477. */
  1478. static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
  1479. DbPage *pDbPage;
  1480. assert( sqlite3_mutex_held(pBt->mutex) );
  1481. pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
  1482. if( pDbPage ){
  1483. return btreePageFromDbPage(pDbPage, pgno, pBt);
  1484. }
  1485. return 0;
  1486. }
  1487. /*
  1488. ** Return the size of the database file in pages. If there is any kind of
  1489. ** error, return ((unsigned int)-1).
  1490. */
  1491. static Pgno btreePagecount(BtShared *pBt){
  1492. return pBt->nPage;
  1493. }
  1494. u32 sqlite3BtreeLastPage(Btree *p){
  1495. assert( sqlite3BtreeHoldsMutex(p) );
  1496. assert( ((p->pBt->nPage)&0x8000000)==0 );
  1497. return (int)btreePagecount(p->pBt);
  1498. }
  1499. /*
  1500. ** Get a page from the pager and initialize it. This routine is just a
  1501. ** convenience wrapper around separate calls to btreeGetPage() and
  1502. ** btreeInitPage().
  1503. **
  1504. ** If an error occurs, then the value *ppPage is set to is undefined. It
  1505. ** may remain unchanged, or it may be set to an invalid value.
  1506. */
  1507. static int getAndInitPage(
  1508. BtShared *pBt, /* The database file */
  1509. Pgno pgno, /* Number of the page to get */
  1510. MemPage **ppPage, /* Write the page pointer here */
  1511. int bReadonly /* PAGER_GET_READONLY or 0 */
  1512. ){
  1513. int rc;
  1514. assert( sqlite3_mutex_held(pBt->mutex) );
  1515. assert( bReadonly==PAGER_GET_READONLY || bReadonly==0 );
  1516. if( pgno>btreePagecount(pBt) ){
  1517. rc = SQLITE_CORRUPT_BKPT;
  1518. }else{
  1519. rc = btreeGetPage(pBt, pgno, ppPage, bReadonly);
  1520. if( rc==SQLITE_OK ){
  1521. rc = btreeInitPage(*ppPage);
  1522. if( rc!=SQLITE_OK ){
  1523. releasePage(*ppPage);
  1524. }
  1525. }
  1526. }
  1527. testcase( pgno==0 );
  1528. assert( pgno!=0 || rc==SQLITE_CORRUPT );
  1529. return rc;
  1530. }
  1531. /*
  1532. ** Release a MemPage. This should be called once for each prior
  1533. ** call to btreeGetPage.
  1534. */
  1535. static void releasePage(MemPage *pPage){
  1536. if( pPage ){
  1537. assert( pPage->aData );
  1538. assert( pPage->pBt );
  1539. assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1540. assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  1541. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1542. sqlite3PagerUnref(pPage->pDbPage);
  1543. }
  1544. }
  1545. /*
  1546. ** During a rollback, when the pager reloads information into the cache
  1547. ** so that the cache is restored to its original state at the start of
  1548. ** the transaction, for each page restored this routine is called.
  1549. **
  1550. ** This routine needs to reset the extra data section at the end of the
  1551. ** page to agree with the restored data.
  1552. */
  1553. static void pageReinit(DbPage *pData){
  1554. MemPage *pPage;
  1555. pPage = (MemPage *)sqlite3PagerGetExtra(pData);
  1556. assert( sqlite3PagerPageRefcount(pData)>0 );
  1557. if( pPage->isInit ){
  1558. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1559. pPage->isInit = 0;
  1560. if( sqlite3PagerPageRefcount(pData)>1 ){
  1561. /* pPage might not be a btree page; it might be an overflow page
  1562. ** or ptrmap page or a free page. In those cases, the following
  1563. ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
  1564. ** But no harm is done by this. And it is very important that
  1565. ** btreeInitPage() be called on every btree page so we make
  1566. ** the call for every page that comes in for re-initing. */
  1567. btreeInitPage(pPage);
  1568. }
  1569. }
  1570. }
  1571. /*
  1572. ** Invoke the busy handler for a btree.
  1573. */
  1574. static int btreeInvokeBusyHandler(void *pArg){
  1575. BtShared *pBt = (BtShared*)pArg;
  1576. assert( pBt->db );
  1577. assert( sqlite3_mutex_held(pBt->db->mutex) );
  1578. return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
  1579. }
  1580. /*
  1581. ** Open a database file.
  1582. **
  1583. ** zFilename is the name of the database file. If zFilename is NULL
  1584. ** then an ephemeral database is created. The ephemeral database might
  1585. ** be exclusively in memory, or it might use a disk-based memory cache.
  1586. ** Either way, the ephemeral database will be automatically deleted
  1587. ** when sqlite3BtreeClose() is called.
  1588. **
  1589. ** If zFilename is ":memory:" then an in-memory database is created
  1590. ** that is automatically destroyed when it is closed.
  1591. **
  1592. ** The "flags" parameter is a bitmask that might contain bits like
  1593. ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
  1594. **
  1595. ** If the database is already opened in the same database connection
  1596. ** and we are in shared cache mode, then the open will fail with an
  1597. ** SQLITE_CONSTRAINT error. We cannot allow two or more BtShared
  1598. ** objects in the same database connection since doing so will lead
  1599. ** to problems with locking.
  1600. */
  1601. int sqlite3BtreeOpen(
  1602. sqlite3_vfs *pVfs, /* VFS to use for this b-tree */
  1603. const char *zFilename, /* Name of the file containing the BTree database */
  1604. sqlite3 *db, /* Associated database handle */
  1605. Btree **ppBtree, /* Pointer to new Btree object written here */
  1606. int flags, /* Options */
  1607. int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
  1608. ){
  1609. BtShared *pBt = 0; /* Shared part of btree structure */
  1610. Btree *p; /* Handle to return */
  1611. sqlite3_mutex *mutexOpen = 0; /* Prevents a race condition. Ticket #3537 */
  1612. int rc = SQLITE_OK; /* Result code from this function */
  1613. u8 nReserve; /* Byte of unused space on each page */
  1614. unsigned char zDbHeader[100]; /* Database header content */
  1615. /* True if opening an ephemeral, temporary database */
  1616. const int isTempDb = zFilename==0 || zFilename[0]==0;
  1617. /* Set the variable isMemdb to true for an in-memory database, or
  1618. ** false for a file-based database.
  1619. */
  1620. #ifdef SQLITE_OMIT_MEMORYDB
  1621. const int isMemdb = 0;
  1622. #else
  1623. const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
  1624. || (isTempDb && sqlite3TempInMemory(db))
  1625. || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
  1626. #endif
  1627. assert( db!=0 );
  1628. assert( pVfs!=0 );
  1629. assert( sqlite3_mutex_held(db->mutex) );
  1630. assert( (flags&0xff)==flags ); /* flags fit in 8 bits */
  1631. /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
  1632. assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
  1633. /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
  1634. assert( (flags & BTREE_SINGLE)==0 || isTempDb );
  1635. if( isMemdb ){
  1636. flags |= BTREE_MEMORY;
  1637. }
  1638. if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
  1639. vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
  1640. }
  1641. p = sqlite3MallocZero(sizeof(Btree));
  1642. if( !p ){
  1643. return SQLITE_NOMEM;
  1644. }
  1645. p->inTrans = TRANS_NONE;
  1646. p->db = db;
  1647. #ifndef SQLITE_OMIT_SHARED_CACHE
  1648. p->lock.pBtree = p;
  1649. p->lock.iTable = 1;
  1650. #endif
  1651. #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1652. /*
  1653. ** If this Btree is a candidate for shared cache, try to find an
  1654. ** existing BtShared object that we can share with
  1655. */
  1656. if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
  1657. if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
  1658. int nFullPathname = pVfs->mxPathname+1;
  1659. char *zFullPathname = sqlite3Malloc(nFullPathname);
  1660. MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  1661. p->sharable = 1;
  1662. if( !zFullPathname ){
  1663. sqlite3_free(p);
  1664. return SQLITE_NOMEM;
  1665. }
  1666. if( isMemdb ){
  1667. memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);
  1668. }else{
  1669. rc = sqlite3OsFullPathname(pVfs, zFilename,
  1670. nFullPathname, zFullPathname);
  1671. if( rc ){
  1672. sqlite3_free(zFullPathname);
  1673. sqlite3_free(p);
  1674. return rc;
  1675. }
  1676. }
  1677. #if SQLITE_THREADSAFE
  1678. mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
  1679. sqlite3_mutex_enter(mutexOpen);
  1680. mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1681. sqlite3_mutex_enter(mutexShared);
  1682. #endif
  1683. for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
  1684. assert( pBt->nRef>0 );
  1685. if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
  1686. && sqlite3PagerVfs(pBt->pPager)==pVfs ){
  1687. int iDb;
  1688. for(iDb=db->nDb-1; iDb>=0; iDb--){
  1689. Btree *pExisting = db->aDb[iDb].pBt;
  1690. if( pExisting && pExisting->pBt==pBt ){
  1691. sqlite3_mutex_leave(mutexShared);
  1692. sqlite3_mutex_leave(mutexOpen);
  1693. sqlite3_free(zFullPathname);
  1694. sqlite3_free(p);
  1695. return SQLITE_CONSTRAINT;
  1696. }
  1697. }
  1698. p->pBt = pBt;
  1699. pBt->nRef++;
  1700. break;
  1701. }
  1702. }
  1703. sqlite3_mutex_leave(mutexShared);
  1704. sqlite3_free(zFullPathname);
  1705. }
  1706. #ifdef SQLITE_DEBUG
  1707. else{
  1708. /* In debug mode, we mark all persistent databases as sharable
  1709. ** even when they are not. This exercises the locking code and
  1710. ** gives more opportunity for asserts(sqlite3_mutex_held())
  1711. ** statements to find locking problems.
  1712. */
  1713. p->sharable = 1;
  1714. }
  1715. #endif
  1716. }
  1717. #endif
  1718. if( pBt==0 ){
  1719. /*
  1720. ** The following asserts make sure that structures used by the btree are
  1721. ** the right size. This is to guard against size changes that result
  1722. ** when compiling on a different architecture.
  1723. */
  1724. assert( sizeof(i64)==8 || sizeof(i64)==4 );
  1725. assert( sizeof(u64)==8 || sizeof(u64)==4 );
  1726. assert( sizeof(u32)==4 );
  1727. assert( sizeof(u16)==2 );
  1728. assert( sizeof(Pgno)==4 );
  1729. pBt = sqlite3MallocZero( sizeof(*pBt) );
  1730. if( pBt==0 ){
  1731. rc = SQLITE_NOMEM;
  1732. goto btree_open_out;
  1733. }
  1734. rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
  1735. EXTRA_SIZE, flags, vfsFlags, pageReinit);
  1736. if( rc==SQLITE_OK ){
  1737. sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
  1738. rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
  1739. }
  1740. if( rc!=SQLITE_OK ){
  1741. goto btree_open_out;
  1742. }
  1743. pBt->openFlags = (u8)flags;
  1744. pBt->db = db;
  1745. sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
  1746. p->pBt = pBt;
  1747. pBt->pCursor = 0;
  1748. pBt->pPage1 = 0;
  1749. if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
  1750. #ifdef SQLITE_SECURE_DELETE
  1751. pBt->btsFlags |= BTS_SECURE_DELETE;
  1752. #endif
  1753. pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
  1754. if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
  1755. || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
  1756. pBt->pageSize = 0;
  1757. #ifndef SQLITE_OMIT_AUTOVACUUM
  1758. /* If the magic name ":memory:" will create an in-memory database, then
  1759. ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
  1760. ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
  1761. ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
  1762. ** regular file-name. In this case the auto-vacuum applies as per normal.
  1763. */
  1764. if( zFilename && !isMemdb ){
  1765. pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
  1766. pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
  1767. }
  1768. #endif
  1769. nReserve = 0;
  1770. }else{
  1771. nReserve = zDbHeader[20];
  1772. pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  1773. #ifndef SQLITE_OMIT_AUTOVACUUM
  1774. pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
  1775. pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
  1776. #endif
  1777. }
  1778. rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  1779. if( rc ) goto btree_open_out;
  1780. pBt->usableSize = pBt->pageSize - nReserve;
  1781. assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
  1782. #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1783. /* Add the new BtShared object to the linked list sharable BtShareds.
  1784. */
  1785. if( p->sharable ){
  1786. MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  1787. pBt->nRef = 1;
  1788. MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
  1789. if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
  1790. pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
  1791. if( pBt->mutex==0 ){
  1792. rc = SQLITE_NOMEM;
  1793. db->mallocFailed = 0;
  1794. goto btree_open_out;
  1795. }
  1796. }
  1797. sqlite3_mutex_enter(mutexShared);
  1798. pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1799. GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
  1800. sqlite3_mutex_leave(mutexShared);
  1801. }
  1802. #endif
  1803. }
  1804. #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1805. /* If the new Btree uses a sharable pBtShared, then link the new
  1806. ** Btree into the list of all sharable Btrees for the same connection.
  1807. ** The list is kept in ascending order by pBt address.
  1808. */
  1809. if( p->sharable ){
  1810. int i;
  1811. Btree *pSib;
  1812. for(i=0; i<db->nDb; i++){
  1813. if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
  1814. while( pSib->pPrev ){ pSib = pSib->pPrev; }
  1815. if( p->pBt<pSib->pBt ){
  1816. p->pNext = pSib;
  1817. p->pPrev = 0;
  1818. pSib->pPrev = p;
  1819. }else{
  1820. while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
  1821. pSib = pSib->pNext;
  1822. }
  1823. p->pNext = pSib->pNext;
  1824. p->pPrev = pSib;
  1825. if( p->pNext ){
  1826. p->pNext->pPrev = p;
  1827. }
  1828. pSib->pNext = p;
  1829. }
  1830. break;
  1831. }
  1832. }
  1833. }
  1834. #endif
  1835. *ppBtree = p;
  1836. btree_open_out:
  1837. if( rc!=SQLITE_OK ){
  1838. if( pBt && pBt->pPager ){
  1839. sqlite3PagerClose(pBt->pPager);
  1840. }
  1841. sqlite3_free(pBt);
  1842. sqlite3_free(p);
  1843. *ppBtree = 0;
  1844. }else{
  1845. /* If the B-Tree was successfully opened, set the pager-cache size to the
  1846. ** default value. Except, when opening on an existing shared pager-cache,
  1847. ** do not change the pager-cache size.
  1848. */
  1849. if( sqlite3BtreeSchema(p, 0, 0)==0 ){
  1850. sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
  1851. }
  1852. }
  1853. if( mutexOpen ){
  1854. assert( sqlite3_mutex_held(mutexOpen) );
  1855. sqlite3_mutex_leave(mutexOpen);
  1856. }
  1857. return rc;
  1858. }
  1859. /*
  1860. ** Decrement the BtShared.nRef counter. When it reaches zero,
  1861. ** remove the BtShared structure from the sharing list. Return
  1862. ** true if the BtShared.nRef counter reaches zero and return
  1863. ** false if it is still positive.
  1864. */
  1865. static int removeFromSharingList(BtShared *pBt){
  1866. #ifndef SQLITE_OMIT_SHARED_CACHE
  1867. MUTEX_LOGIC( sqlite3_mutex *pMaster; )
  1868. BtShared *pList;
  1869. int removed = 0;
  1870. assert( sqlite3_mutex_notheld(pBt->mutex) );
  1871. MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
  1872. sqlite3_mutex_enter(pMaster);
  1873. pBt->nRef--;
  1874. if( pBt->nRef<=0 ){
  1875. if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
  1876. GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
  1877. }else{
  1878. pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1879. while( ALWAYS(pList) && pList->pNext!=pBt ){
  1880. pList=pList->pNext;
  1881. }
  1882. if( ALWAYS(pList) ){
  1883. pList->pNext = pBt->pNext;
  1884. }
  1885. }
  1886. if( SQLITE_THREADSAFE ){
  1887. sqlite3_mutex_free(pBt->mutex);
  1888. }
  1889. removed = 1;
  1890. }
  1891. sqlite3_mutex_leave(pMaster);
  1892. return removed;
  1893. #else
  1894. return 1;
  1895. #endif
  1896. }
  1897. /*
  1898. ** Make sure pBt->pTmpSpace points to an allocation of
  1899. ** MX_CELL_SIZE(pBt) bytes.
  1900. */
  1901. static void allocateTempSpace(BtShared *pBt){
  1902. if( !pBt->pTmpSpace ){
  1903. pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
  1904. /* One of the uses of pBt->pTmpSpace is to format cells before
  1905. ** inserting them into a leaf page (function fillInCell()). If
  1906. ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
  1907. ** by the various routines that manipulate binary cells. Which
  1908. ** can mean that fillInCell() only initializes the first 2 or 3
  1909. ** bytes of pTmpSpace, but that the first 4 bytes are copied from
  1910. ** it into a database page. This is not actually a problem, but it
  1911. ** does cause a valgrind error when the 1 or 2 bytes of unitialized
  1912. ** data is passed to system call write(). So to avoid this error,
  1913. ** zero the first 4 bytes of temp space here. */
  1914. if( pBt->pTmpSpace ) memset(pBt->pTmpSpace, 0, 4);
  1915. }
  1916. }
  1917. /*
  1918. ** Free the pBt->pTmpSpace allocation
  1919. */
  1920. static void freeTempSpace(BtShared *pBt){
  1921. sqlite3PageFree( pBt->pTmpSpace);
  1922. pBt->pTmpSpace = 0;
  1923. }
  1924. /*
  1925. ** Close an open database and invalidate all cursors.
  1926. */
  1927. int sqlite3BtreeClose(Btree *p){
  1928. BtShared *pBt = p->pBt;
  1929. BtCursor *pCur;
  1930. /* Close all cursors opened via this handle. */
  1931. assert( sqlite3_mutex_held(p->db->mutex) );
  1932. sqlite3BtreeEnter(p);
  1933. pCur = pBt->pCursor;
  1934. while( pCur ){
  1935. BtCursor *pTmp = pCur;
  1936. pCur = pCur->pNext;
  1937. if( pTmp->pBtree==p ){
  1938. sqlite3BtreeCloseCursor(pTmp);
  1939. }
  1940. }
  1941. /* Rollback any active transaction and free the handle structure.
  1942. ** The call to sqlite3BtreeRollback() drops any table-locks held by
  1943. ** this handle.
  1944. */
  1945. sqlite3BtreeRollback(p, SQLITE_OK);
  1946. sqlite3BtreeLeave(p);
  1947. /* If there are still other outstanding references to the shared-btree
  1948. ** structure, return now. The remainder of this procedure cleans
  1949. ** up the shared-btree.
  1950. */
  1951. assert( p->wantToLock==0 && p->locked==0 );
  1952. if( !p->sharable || removeFromSharingList(pBt) ){
  1953. /* The pBt is no longer on the sharing list, so we can access
  1954. ** it without having to hold the mutex.
  1955. **
  1956. ** Clean out and delete the BtShared object.
  1957. */
  1958. assert( !pBt->pCursor );
  1959. sqlite3PagerClose(pBt->pPager);
  1960. if( pBt->xFreeSchema && pBt->pSchema ){
  1961. pBt->xFreeSchema(pBt->pSchema);
  1962. }
  1963. sqlite3DbFree(0, pBt->pSchema);
  1964. freeTempSpace(pBt);
  1965. sqlite3_free(pBt);
  1966. }
  1967. #ifndef SQLITE_OMIT_SHARED_CACHE
  1968. assert( p->wantToLock==0 );
  1969. assert( p->locked==0 );
  1970. if( p->pPrev ) p->pPrev->pNext = p->pNext;
  1971. if( p->pNext ) p->pNext->pPrev = p->pPrev;
  1972. #endif
  1973. sqlite3_free(p);
  1974. return SQLITE_OK;
  1975. }
  1976. /*
  1977. ** Change the limit on the number of pages allowed in the cache.
  1978. **
  1979. ** The maximum number of cache pages is set to the absolute
  1980. ** value of mxPage. If mxPage is negative, the pager will
  1981. ** operate asynchronously - it will not stop to do fsync()s
  1982. ** to insure data is written to the disk surface before
  1983. ** continuing. Transactions still work if synchronous is off,
  1984. ** and the database cannot be corrupted if this program
  1985. ** crashes. But if the operating system crashes or there is
  1986. ** an abrupt power failure when synchronous is off, the database
  1987. ** could be left in an inconsistent and unrecoverable state.
  1988. ** Synchronous is on by default so database corruption is not
  1989. ** normally a worry.
  1990. */
  1991. int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
  1992. BtShared *pBt = p->pBt;
  1993. assert( sqlite3_mutex_held(p->db->mutex) );
  1994. sqlite3BtreeEnter(p);
  1995. sqlite3PagerSetCachesize(pBt->pPager, mxPage);
  1996. sqlite3BtreeLeave(p);
  1997. return SQLITE_OK;
  1998. }
  1999. /*
  2000. ** Change the limit on the amount of the database file that may be
  2001. ** memory mapped.
  2002. */
  2003. int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  2004. BtShared *pBt = p->pBt;
  2005. assert( sqlite3_mutex_held(p->db->mutex) );
  2006. sqlite3BtreeEnter(p);
  2007. sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
  2008. sqlite3BtreeLeave(p);
  2009. return SQLITE_OK;
  2010. }
  2011. /*
  2012. ** Change the way data is synced to disk in order to increase or decrease
  2013. ** how well the database resists damage due to OS crashes and power
  2014. ** failures. Level 1 is the same as asynchronous (no syncs() occur and
  2015. ** there is a high probability of damage) Level 2 is the default. There
  2016. ** is a very low but non-zero probability of damage. Level 3 reduces the
  2017. ** probability of damage to near zero but with a write performance reduction.
  2018. */
  2019. #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  2020. int sqlite3BtreeSetPagerFlags(
  2021. Btree *p, /* The btree to set the safety level on */
  2022. unsigned pgFlags /* Various PAGER_* flags */
  2023. ){
  2024. BtShared *pBt = p->pBt;
  2025. assert( sqlite3_mutex_held(p->db->mutex) );
  2026. sqlite3BtreeEnter(p);
  2027. sqlite3PagerSetFlags(pBt->pPager, pgFlags);
  2028. sqlite3BtreeLeave(p);
  2029. return SQLITE_OK;
  2030. }
  2031. #endif
  2032. /*
  2033. ** Return TRUE if the given btree is set to safety level 1. In other
  2034. ** words, return TRUE if no sync() occurs on the disk files.
  2035. */
  2036. int sqlite3BtreeSyncDisabled(Btree *p){
  2037. BtShared *pBt = p->pBt;
  2038. int rc;
  2039. assert( sqlite3_mutex_held(p->db->mutex) );
  2040. sqlite3BtreeEnter(p);
  2041. assert( pBt && pBt->pPager );
  2042. rc = sqlite3PagerNosync(pBt->pPager);
  2043. sqlite3BtreeLeave(p);
  2044. return rc;
  2045. }
  2046. /*
  2047. ** Change the default pages size and the number of reserved bytes per page.
  2048. ** Or, if the page size has already been fixed, return SQLITE_READONLY
  2049. ** without changing anything.
  2050. **
  2051. ** The page size must be a power of 2 between 512 and 65536. If the page
  2052. ** size supplied does not meet this constraint then the page size is not
  2053. ** changed.
  2054. **
  2055. ** Page sizes are constrained to be a power of two so that the region
  2056. ** of the database file used for locking (beginning at PENDING_BYTE,
  2057. ** the first byte past the 1GB boundary, 0x40000000) needs to occur
  2058. ** at the beginning of a page.
  2059. **
  2060. ** If parameter nReserve is less than zero, then the number of reserved
  2061. ** bytes per page is left unchanged.
  2062. **
  2063. ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
  2064. ** and autovacuum mode can no longer be changed.
  2065. */
  2066. int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
  2067. int rc = SQLITE_OK;
  2068. BtShared *pBt = p->pBt;
  2069. assert( nReserve>=-1 && nReserve<=255 );
  2070. sqlite3BtreeEnter(p);
  2071. if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
  2072. sqlite3BtreeLeave(p);
  2073. return SQLITE_READONLY;
  2074. }
  2075. if( nReserve<0 ){
  2076. nReserve = pBt->pageSize - pBt->usableSize;
  2077. }
  2078. assert( nReserve>=0 && nReserve<=255 );
  2079. if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
  2080. ((pageSize-1)&pageSize)==0 ){
  2081. assert( (pageSize & 7)==0 );
  2082. assert( !pBt->pPage1 && !pBt->pCursor );
  2083. pBt->pageSize = (u32)pageSize;
  2084. freeTempSpace(pBt);
  2085. }
  2086. rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  2087. pBt->usableSize = pBt->pageSize - (u16)nReserve;
  2088. if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2089. sqlite3BtreeLeave(p);
  2090. return rc;
  2091. }
  2092. /*
  2093. ** Return the currently defined page size
  2094. */
  2095. int sqlite3BtreeGetPageSize(Btree *p){
  2096. return p->pBt->pageSize;
  2097. }
  #if defined(SQLITE_HAS_CODEC) || defined(SQLITE_DEBUG)
  /*
  ** Variant of sqlite3BtreeGetReserve() that does not enter the b-tree
  ** mutex itself.  It may only be called when the caller can guarantee
  ** that the b-tree mutex is already held.
  **
  ** This is useful in one special case in the backup API code where it is
  ** known that the shared b-tree mutex is held, but the mutex on the
  ** database handle that owns *p is not.  In this case if
  ** sqlite3BtreeEnter() were to be called, it might collide with some
  ** other operation on the database handle that owns *p, causing
  ** undefined behavior.
  **
  ** Returns pageSize minus usableSize, i.e. the per-page reserved bytes.
  */
  int sqlite3BtreeGetReserveNoMutex(Btree *p){
    int nReserved;

    assert( sqlite3_mutex_held(p->pBt->mutex) );
    nReserved = p->pBt->pageSize - p->pBt->usableSize;
    return nReserved;
  }
  #endif /* SQLITE_HAS_CODEC || SQLITE_DEBUG */
  2115. #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
  2116. /*
  2117. ** Return the number of bytes of space at the end of every page that
  2118. ** are intentually left unused. This is the "reserved" space that is
  2119. ** sometimes used by extensions.
  2120. */
  2121. int sqlite3BtreeGetReserve(Btree *p){
  2122. int n;
  2123. sqlite3BtreeEnter(p);
  2124. n = p->pBt->pageSize - p->pBt->usableSize;
  2125. sqlite3BtreeLeave(p);
  2126. return n;
  2127. }
  2128. /*
  2129. ** Set the maximum page count for a database if mxPage is positive.
  2130. ** No changes are made if mxPage is 0 or negative.
  2131. ** Regardless of the value of mxPage, return the maximum page count.
  2132. */
  2133. int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
  2134. int n;
  2135. sqlite3BtreeEnter(p);
  2136. n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
  2137. sqlite3BtreeLeave(p);
  2138. return n;
  2139. }
  2140. /*
  2141. ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1. If newFlag is -1,
  2142. ** then make no changes. Always return the value of the BTS_SECURE_DELETE
  2143. ** setting after the change.
  2144. */
  2145. int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
  2146. int b;
  2147. if( p==0 ) return 0;
  2148. sqlite3BtreeEnter(p);
  2149. if( newFlag>=0 ){
  2150. p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
  2151. if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
  2152. }
  2153. b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
  2154. sqlite3BtreeLeave(p);
  2155. return b;
  2156. }
  2157. #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
  2158. /*
  2159. ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
  2160. ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
  2161. ** is disabled. The default value for the auto-vacuum property is
  2162. ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
  2163. */
  2164. int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
  2165. #ifdef SQLITE_OMIT_AUTOVACUUM
  2166. return SQLITE_READONLY;
  2167. #else
  2168. BtShared *pBt = p->pBt;
  2169. int rc = SQLITE_OK;
  2170. u8 av = (u8)autoVacuum;
  2171. sqlite3BtreeEnter(p);
  2172. if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
  2173. rc = SQLITE_READONLY;
  2174. }else{
  2175. pBt->autoVacuum = av ?1:0;
  2176. pBt->incrVacuum = av==2 ?1:0;
  2177. }
  2178. sqlite3BtreeLeave(p);
  2179. return rc;
  2180. #endif
  2181. }
  2182. /*
  2183. ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
  2184. ** enabled 1 is returned. Otherwise 0.
  2185. */
  2186. int sqlite3BtreeGetAutoVacuum(Btree *p){
  2187. #ifdef SQLITE_OMIT_AUTOVACUUM
  2188. return BTREE_AUTOVACUUM_NONE;
  2189. #else
  2190. int rc;
  2191. sqlite3BtreeEnter(p);
  2192. rc = (
  2193. (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
  2194. (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
  2195. BTREE_AUTOVACUUM_INCR
  2196. );
  2197. sqlite3BtreeLeave(p);
  2198. return rc;
  2199. #endif
  2200. }
  2201. /*
  2202. ** Get a reference to pPage1 of the database file. This will
  2203. ** also acquire a readlock on that file.
  2204. **
  2205. ** SQLITE_OK is returned on success. If the file is not a
  2206. ** well-formed database file, then SQLITE_CORRUPT is returned.
  2207. ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
  2208. ** is returned if we run out of memory.
  2209. */
  2210. static int lockBtree(BtShared *pBt){
  2211. int rc; /* Result code from subfunctions */
  2212. MemPage *pPage1; /* Page 1 of the database file */
  2213. int nPage; /* Number of pages in the database */
  2214. int nPageFile = 0; /* Number of pages in the database file */
  2215. int nPageHeader; /* Number of pages in the database according to hdr */
  2216. assert( sqlite3_mutex_held(pBt->mutex) );
  2217. assert( pBt->pPage1==0 );
  2218. rc = sqlite3PagerSharedLock(pBt->pPager);
  2219. if( rc!=SQLITE_OK ) return rc;
  2220. rc = btreeGetPage(pBt, 1, &pPage1, 0);
  2221. if( rc!=SQLITE_OK ) return rc;
  2222. /* Do some checking to help insure the file we opened really is
  2223. ** a valid database file.
  2224. */
  2225. nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
  2226. sqlite3PagerPagecount(pBt->pPager, &nPageFile);
  2227. if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
  2228. nPage = nPageFile;
  2229. }
  2230. if( nPage>0 ){
  2231. u32 pageSize;
  2232. u32 usableSize;
  2233. u8 *page1 = pPage1->aData;
  2234. rc = SQLITE_NOTADB;
  2235. if( memcmp(page1, zMagicHeader, 16)!=0 ){
  2236. goto page1_init_failed;
  2237. }
  2238. #ifdef SQLITE_OMIT_WAL
  2239. if( page1[18]>1 ){
  2240. pBt->btsFlags |= BTS_READ_ONLY;
  2241. }
  2242. if( page1[19]>1 ){
  2243. goto page1_init_failed;
  2244. }
  2245. #else
  2246. if( page1[18]>2 ){
  2247. pBt->btsFlags |= BTS_READ_ONLY;
  2248. }
  2249. if( page1[19]>2 ){
  2250. goto page1_init_failed;
  2251. }
  2252. /* If the write version is set to 2, this database should be accessed
  2253. ** in WAL mode. If the log is not already open, open it now. Then
  2254. ** return SQLITE_OK and return without populating BtShared.pPage1.
  2255. ** The caller detects this and calls this function again. This is
  2256. ** required as the version of page 1 currently in the page1 buffer
  2257. ** may not be the latest version - there may be a newer one in the log
  2258. ** file.
  2259. */
  2260. if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
  2261. int isOpen = 0;
  2262. rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
  2263. if( rc!=SQLITE_OK ){
  2264. goto page1_init_failed;
  2265. }else if( isOpen==0 ){
  2266. releasePage(pPage1);
  2267. return SQLITE_OK;
  2268. }
  2269. rc = SQLITE_NOTADB;
  2270. }
  2271. #endif
  2272. /* The maximum embedded fraction must be exactly 25%. And the minimum
  2273. ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
  2274. ** The original design allowed these amounts to vary, but as of
  2275. ** version 3.6.0, we require them to be fixed.
  2276. */
  2277. if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
  2278. goto page1_init_failed;
  2279. }
  2280. pageSize = (page1[16]<<8) | (page1[17]<<16);
  2281. if( ((pageSize-1)&pageSize)!=0
  2282. || pageSize>SQLITE_MAX_PAGE_SIZE
  2283. || pageSize<=256
  2284. ){
  2285. goto page1_init_failed;
  2286. }
  2287. assert( (pageSize & 7)==0 );
  2288. usableSize = pageSize - page1[20];
  2289. if( (u32)pageSize!=pBt->pageSize ){
  2290. /* After reading the first page of the database assuming a page size
  2291. ** of BtShared.pageSize, we have discovered that the page-size is
  2292. ** actually pageSize. Unlock the database, leave pBt->pPage1 at
  2293. ** zero and return SQLITE_OK. The caller will call this function
  2294. ** again with the correct page-size.
  2295. */
  2296. releasePage(pPage1);
  2297. pBt->usableSize = usableSize;
  2298. pBt->pageSize = pageSize;
  2299. freeTempSpace(pBt);
  2300. rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
  2301. pageSize-usableSize);
  2302. return rc;
  2303. }
  2304. if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
  2305. rc = SQLITE_CORRUPT_BKPT;
  2306. goto page1_init_failed;
  2307. }
  2308. if( usableSize<480 ){
  2309. goto page1_init_failed;
  2310. }
  2311. pBt->pageSize = pageSize;
  2312. pBt->usableSize = usableSize;
  2313. #ifndef SQLITE_OMIT_AUTOVACUUM
  2314. pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
  2315. pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
  2316. #endif
  2317. }
  2318. /* maxLocal is the maximum amount of payload to store locally for
  2319. ** a cell. Make sure it is small enough so that at least minFanout
  2320. ** cells can will fit on one page. We assume a 10-byte page header.
  2321. ** Besides the payload, the cell must store:
  2322. ** 2-byte pointer to the cell
  2323. ** 4-byte child pointer
  2324. ** 9-byte nKey value
  2325. ** 4-byte nData value
  2326. ** 4-byte overflow page pointer
  2327. ** So a cell consists of a 2-byte pointer, a header which is as much as
  2328. ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  2329. ** page pointer.
  2330. */
  2331. pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  2332. pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  2333. pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  2334. pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  2335. if( pBt->maxLocal>127 ){
  2336. pBt->max1bytePayload = 127;
  2337. }else{
  2338. pBt->max1bytePayload = (u8)pBt->maxLocal;
  2339. }
  2340. assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  2341. pBt->pPage1 = pPage1;
  2342. pBt->nPage = nPage;
  2343. return SQLITE_OK;
  2344. page1_init_failed:
  2345. releasePage(pPage1);
  2346. pBt->pPage1 = 0;
  2347. return rc;
  2348. }
  2349. #ifndef NDEBUG
  2350. /*
  2351. ** Return the number of cursors open on pBt. This is for use
  2352. ** in assert() expressions, so it is only compiled if NDEBUG is not
  2353. ** defined.
  2354. **
  2355. ** Only write cursors are counted if wrOnly is true. If wrOnly is
  2356. ** false then all cursors are counted.
  2357. **
  2358. ** For the purposes of this routine, a cursor is any cursor that
  2359. ** is capable of reading or writing to the databse. Cursors that
  2360. ** have been tripped into the CURSOR_FAULT state are not counted.
  2361. */
  2362. static int countValidCursors(BtShared *pBt, int wrOnly){
  2363. BtCursor *pCur;
  2364. int r = 0;
  2365. for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
  2366. if( (wrOnly==0 || pCur->wrFlag) && pCur->eState!=CURSOR_FAULT ) r++;
  2367. }
  2368. return r;
  2369. }
  2370. #endif
  2371. /*
  2372. ** If there are no outstanding cursors and we are not in the middle
  2373. ** of a transaction but there is a read lock on the database, then
  2374. ** this routine unrefs the first page of the database file which
  2375. ** has the effect of releasing the read lock.
  2376. **
  2377. ** If there is a transaction in progress, this routine is a no-op.
  2378. */
  2379. static void unlockBtreeIfUnused(BtShared *pBt){
  2380. assert( sqlite3_mutex_held(pBt->mutex) );
  2381. assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
  2382. if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
  2383. assert( pBt->pPage1->aData );
  2384. assert( sqlite3PagerRefcount(pBt->pPager)==1 );
  2385. assert( pBt->pPage1->aData );
  2386. releasePage(pBt->pPage1);
  2387. pBt->pPage1 = 0;
  2388. }
  2389. }
  2390. /*
  2391. ** If pBt points to an empty file then convert that empty file
  2392. ** into a new empty database by initializing the first page of
  2393. ** the database.
  2394. */
  2395. static int newDatabase(BtShared *pBt){
  2396. MemPage *pP1;
  2397. unsigned char *data;
  2398. int rc;
  2399. assert( sqlite3_mutex_held(pBt->mutex) );
  2400. if( pBt->nPage>0 ){
  2401. return SQLITE_OK;
  2402. }
  2403. pP1 = pBt->pPage1;
  2404. assert( pP1!=0 );
  2405. data = pP1->aData;
  2406. rc = sqlite3PagerWrite(pP1->pDbPage);
  2407. if( rc ) return rc;
  2408. memcpy(data, zMagicHeader, sizeof(zMagicHeader));
  2409. assert( sizeof(zMagicHeader)==16 );
  2410. data[16] = (u8)((pBt->pageSize>>8)&0xff);
  2411. data[17] = (u8)((pBt->pageSize>>16)&0xff);
  2412. data[18] = 1;
  2413. data[19] = 1;
  2414. assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
  2415. data[20] = (u8)(pBt->pageSize - pBt->usableSize);
  2416. data[21] = 64;
  2417. data[22] = 32;
  2418. data[23] = 32;
  2419. memset(&data[24], 0, 100-24);
  2420. zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
  2421. pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2422. #ifndef SQLITE_OMIT_AUTOVACUUM
  2423. assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
  2424. assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
  2425. put4byte(&data[36 + 4*4], pBt->autoVacuum);
  2426. put4byte(&data[36 + 7*4], pBt->incrVacuum);
  2427. #endif
  2428. pBt->nPage = 1;
  2429. data[31] = 1;
  2430. return SQLITE_OK;
  2431. }
  2432. /*
  2433. ** Initialize the first page of the database file (creating a database
  2434. ** consisting of a single page and no schema objects). Return SQLITE_OK
  2435. ** if successful, or an SQLite error code otherwise.
  2436. */
  2437. int sqlite3BtreeNewDb(Btree *p){
  2438. int rc;
  2439. sqlite3BtreeEnter(p);
  2440. p->pBt->nPage = 0;
  2441. rc = newDatabase(p->pBt);
  2442. sqlite3BtreeLeave(p);
  2443. return rc;
  2444. }
  2445. /*
  2446. ** Attempt to start a new transaction. A write-transaction
  2447. ** is started if the second argument is nonzero, otherwise a read-
  2448. ** transaction. If the second argument is 2 or more and exclusive
  2449. ** transaction is started, meaning that no other process is allowed
  2450. ** to access the database. A preexisting transaction may not be
  2451. ** upgraded to exclusive by calling this routine a second time - the
  2452. ** exclusivity flag only works for a new transaction.
  2453. **
  2454. ** A write-transaction must be started before attempting any
  2455. ** changes to the database. None of the following routines
  2456. ** will work unless a transaction is started first:
  2457. **
  2458. ** sqlite3BtreeCreateTable()
  2459. ** sqlite3BtreeCreateIndex()
  2460. ** sqlite3BtreeClearTable()
  2461. ** sqlite3BtreeDropTable()
  2462. ** sqlite3BtreeInsert()
  2463. ** sqlite3BtreeDelete()
  2464. ** sqlite3BtreeUpdateMeta()
  2465. **
  2466. ** If an initial attempt to acquire the lock fails because of lock contention
  2467. ** and the database was previously unlocked, then invoke the busy handler
  2468. ** if there is one. But if there was previously a read-lock, do not
  2469. ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
  2470. ** returned when there is already a read-lock in order to avoid a deadlock.
  2471. **
  2472. ** Suppose there are two processes A and B. A has a read lock and B has
  2473. ** a reserved lock. B tries to promote to exclusive but is blocked because
  2474. ** of A's read lock. A tries to promote to reserved but is blocked by B.
  2475. ** One or the other of the two processes must give way or there can be
  2476. ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
  2477. ** when A already has a read lock, we encourage A to give up and let B
  2478. ** proceed.
  2479. */
  2480. int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  2481. sqlite3 *pBlock = 0;
  2482. BtShared *pBt = p->pBt;
  2483. int rc = SQLITE_OK;
  2484. sqlite3BtreeEnter(p);
  2485. btreeIntegrity(p);
  2486. /* If the btree is already in a write-transaction, or it
  2487. ** is already in a read-transaction and a read-transaction
  2488. ** is requested, this is a no-op.
  2489. */
  2490. if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
  2491. goto trans_begun;
  2492. }
  2493. assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );
  2494. /* Write transactions are not possible on a read-only database */
  2495. if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
  2496. rc = SQLITE_READONLY;
  2497. goto trans_begun;
  2498. }
  2499. #ifndef SQLITE_OMIT_SHARED_CACHE
  2500. /* If another database handle has already opened a write transaction
  2501. ** on this shared-btree structure and a second write transaction is
  2502. ** requested, return SQLITE_LOCKED.
  2503. */
  2504. if( (wrflag && pBt->inTransaction==TRANS_WRITE)
  2505. || (pBt->btsFlags & BTS_PENDING)!=0
  2506. ){
  2507. pBlock = pBt->pWriter->db;
  2508. }else if( wrflag>1 ){
  2509. BtLock *pIter;
  2510. for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  2511. if( pIter->pBtree!=p ){
  2512. pBlock = pIter->pBtree->db;
  2513. break;
  2514. }
  2515. }
  2516. }
  2517. if( pBlock ){
  2518. sqlite3ConnectionBlocked(p->db, pBlock);
  2519. rc = SQLITE_LOCKED_SHAREDCACHE;
  2520. goto trans_begun;
  2521. }
  2522. #endif
  2523. /* Any read-only or read-write transaction implies a read-lock on
  2524. ** page 1. So if some other shared-cache client already has a write-lock
  2525. ** on page 1, the transaction cannot be opened. */
  2526. rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  2527. if( SQLITE_OK!=rc ) goto trans_begun;
  2528. pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  2529. if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  2530. do {
  2531. /* Call lockBtree() until either pBt->pPage1 is populated or
  2532. ** lockBtree() returns something other than SQLITE_OK. lockBtree()
  2533. ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
  2534. ** reading page 1 it discovers that the page-size of the database
  2535. ** file is not pBt->pageSize. In this case lockBtree() will update
  2536. ** pBt->pageSize to the page-size of the file on disk.
  2537. */
  2538. while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );
  2539. if( rc==SQLITE_OK && wrflag ){
  2540. if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
  2541. rc = SQLITE_READONLY;
  2542. }else{
  2543. rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
  2544. if( rc==SQLITE_OK ){
  2545. rc = newDatabase(pBt);
  2546. }
  2547. }
  2548. }
  2549. if( rc!=SQLITE_OK ){
  2550. unlockBtreeIfUnused(pBt);
  2551. }
  2552. }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
  2553. btreeInvokeBusyHandler(pBt) );
  2554. if( rc==SQLITE_OK ){
  2555. if( p->inTrans==TRANS_NONE ){
  2556. pBt->nTransaction++;
  2557. #ifndef SQLITE_OMIT_SHARED_CACHE
  2558. if( p->sharable ){
  2559. assert( p->lock.pBtree==p && p->lock.iTable==1 );
  2560. p->lock.eLock = READ_LOCK;
  2561. p->lock.pNext = pBt->pLock;
  2562. pBt->pLock = &p->lock;
  2563. }
  2564. #endif
  2565. }
  2566. p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
  2567. if( p->inTrans>pBt->inTransaction ){
  2568. pBt->inTransaction = p->inTrans;
  2569. }
  2570. if( wrflag ){
  2571. MemPage *pPage1 = pBt->pPage1;
  2572. #ifndef SQLITE_OMIT_SHARED_CACHE
  2573. assert( !pBt->pWriter );
  2574. pBt->pWriter = p;
  2575. pBt->btsFlags &= ~BTS_EXCLUSIVE;
  2576. if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
  2577. #endif
  2578. /* If the db-size header field is incorrect (as it may be if an old
  2579. ** client has been writing the database file), update it now. Doing
  2580. ** this sooner rather than later means the database size can safely
  2581. ** re-read the database size from page 1 if a savepoint or transaction
  2582. ** rollback occurs within the transaction.
  2583. */
  2584. if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
  2585. rc = sqlite3PagerWrite(pPage1->pDbPage);
  2586. if( rc==SQLITE_OK ){
  2587. put4byte(&pPage1->aData[28], pBt->nPage);
  2588. }
  2589. }
  2590. }
  2591. }
  2592. trans_begun:
  2593. if( rc==SQLITE_OK && wrflag ){
  2594. /* This call makes sure that the pager has the correct number of
  2595. ** open savepoints. If the second parameter is greater than 0 and
  2596. ** the sub-journal is not already open, then it will be opened here.
  2597. */
  2598. rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  2599. }
  2600. btreeIntegrity(p);
  2601. sqlite3BtreeLeave(p);
  2602. return rc;
  2603. }
  2604. #ifndef SQLITE_OMIT_AUTOVACUUM
  2605. /*
  2606. ** Set the pointer-map entries for all children of page pPage. Also, if
  2607. ** pPage contains cells that point to overflow pages, set the pointer
  2608. ** map entries for the overflow pages as well.
  2609. */
  2610. static int setChildPtrmaps(MemPage *pPage){
  2611. int i; /* Counter variable */
  2612. int nCell; /* Number of cells in page pPage */
  2613. int rc; /* Return code */
  2614. BtShared *pBt = pPage->pBt;
  2615. u8 isInitOrig = pPage->isInit;
  2616. Pgno pgno = pPage->pgno;
  2617. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2618. rc = btreeInitPage(pPage);
  2619. if( rc!=SQLITE_OK ){
  2620. goto set_child_ptrmaps_out;
  2621. }
  2622. nCell = pPage->nCell;
  2623. for(i=0; i<nCell; i++){
  2624. u8 *pCell = findCell(pPage, i);
  2625. ptrmapPutOvflPtr(pPage, pCell, &rc);
  2626. if( !pPage->leaf ){
  2627. Pgno childPgno = get4byte(pCell);
  2628. ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  2629. }
  2630. }
  2631. if( !pPage->leaf ){
  2632. Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  2633. ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  2634. }
  2635. set_child_ptrmaps_out:
  2636. pPage->isInit = isInitOrig;
  2637. return rc;
  2638. }
  2639. /*
  2640. ** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
  2641. ** that it points to iTo. Parameter eType describes the type of pointer to
  2642. ** be modified, as follows:
  2643. **
  2644. ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
  2645. ** page of pPage.
  2646. **
  2647. ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
  2648. ** page pointed to by one of the cells on pPage.
  2649. **
  2650. ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
  2651. ** overflow page in the list.
  2652. */
  2653. static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
  2654. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2655. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  2656. if( eType==PTRMAP_OVERFLOW2 ){
  2657. /* The pointer is always the first 4 bytes of the page in this case. */
  2658. if( get4byte(pPage->aData)!=iFrom ){
  2659. return SQLITE_CORRUPT_BKPT;
  2660. }
  2661. put4byte(pPage->aData, iTo);
  2662. }else{
  2663. u8 isInitOrig = pPage->isInit;
  2664. int i;
  2665. int nCell;
  2666. btreeInitPage(pPage);
  2667. nCell = pPage->nCell;
  2668. for(i=0; i<nCell; i++){
  2669. u8 *pCell = findCell(pPage, i);
  2670. if( eType==PTRMAP_OVERFLOW1 ){
  2671. CellInfo info;
  2672. btreeParseCellPtr(pPage, pCell, &info);
  2673. if( info.iOverflow
  2674. && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
  2675. && iFrom==get4byte(&pCell[info.iOverflow])
  2676. ){
  2677. put4byte(&pCell[info.iOverflow], iTo);
  2678. break;
  2679. }
  2680. }else{
  2681. if( get4byte(pCell)==iFrom ){
  2682. put4byte(pCell, iTo);
  2683. break;
  2684. }
  2685. }
  2686. }
  2687. if( i==nCell ){
  2688. if( eType!=PTRMAP_BTREE ||
  2689. get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
  2690. return SQLITE_CORRUPT_BKPT;
  2691. }
  2692. put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
  2693. }
  2694. pPage->isInit = isInitOrig;
  2695. }
  2696. return SQLITE_OK;
  2697. }
  2698. /*
  2699. ** Move the open database page pDbPage to location iFreePage in the
  2700. ** database. The pDbPage reference remains valid.
  2701. **
  2702. ** The isCommit flag indicates that there is no need to remember that
  2703. ** the journal needs to be sync()ed before database page pDbPage->pgno
  2704. ** can be written to. The caller has already promised not to write to that
  2705. ** page.
  2706. */
  2707. static int relocatePage(
  2708. BtShared *pBt, /* Btree */
  2709. MemPage *pDbPage, /* Open page to move */
  2710. u8 eType, /* Pointer map 'type' entry for pDbPage */
  2711. Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
  2712. Pgno iFreePage, /* The location to move pDbPage to */
  2713. int isCommit /* isCommit flag passed to sqlite3PagerMovepage */
  2714. ){
  2715. MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
  2716. Pgno iDbPage = pDbPage->pgno;
  2717. Pager *pPager = pBt->pPager;
  2718. int rc;
  2719. assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
  2720. eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
  2721. assert( sqlite3_mutex_held(pBt->mutex) );
  2722. assert( pDbPage->pBt==pBt );
  2723. /* Move page iDbPage from its current location to page number iFreePage */
  2724. TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
  2725. iDbPage, iFreePage, iPtrPage, eType));
  2726. rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
  2727. if( rc!=SQLITE_OK ){
  2728. return rc;
  2729. }
  2730. pDbPage->pgno = iFreePage;
  2731. /* If pDbPage was a btree-page, then it may have child pages and/or cells
  2732. ** that point to overflow pages. The pointer map entries for all these
  2733. ** pages need to be changed.
  2734. **
  2735. ** If pDbPage is an overflow page, then the first 4 bytes may store a
  2736. ** pointer to a subsequent overflow page. If this is the case, then
  2737. ** the pointer map needs to be updated for the subsequent overflow page.
  2738. */
  2739. if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
  2740. rc = setChildPtrmaps(pDbPage);
  2741. if( rc!=SQLITE_OK ){
  2742. return rc;
  2743. }
  2744. }else{
  2745. Pgno nextOvfl = get4byte(pDbPage->aData);
  2746. if( nextOvfl!=0 ){
  2747. ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
  2748. if( rc!=SQLITE_OK ){
  2749. return rc;
  2750. }
  2751. }
  2752. }
  2753. /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
  2754. ** that it points at iFreePage. Also fix the pointer map entry for
  2755. ** iPtrPage.
  2756. */
  2757. if( eType!=PTRMAP_ROOTPAGE ){
  2758. rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
  2759. if( rc!=SQLITE_OK ){
  2760. return rc;
  2761. }
  2762. rc = sqlite3PagerWrite(pPtrPage->pDbPage);
  2763. if( rc!=SQLITE_OK ){
  2764. releasePage(pPtrPage);
  2765. return rc;
  2766. }
  2767. rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
  2768. releasePage(pPtrPage);
  2769. if( rc==SQLITE_OK ){
  2770. ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
  2771. }
  2772. }
  2773. return rc;
  2774. }
  2775. /* Forward declaration required by incrVacuumStep(). */
  2776. static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
  2777. /*
  2778. ** Perform a single step of an incremental-vacuum. If successful, return
  2779. ** SQLITE_OK. If there is no work to do (and therefore no point in
  2780. ** calling this function again), return SQLITE_DONE. Or, if an error
  2781. ** occurs, return some other error code.
  2782. **
  2783. ** More specificly, this function attempts to re-organize the database so
  2784. ** that the last page of the file currently in use is no longer in use.
  2785. **
  2786. ** Parameter nFin is the number of pages that this database would contain
  2787. ** were this function called until it returns SQLITE_DONE.
  2788. **
  2789. ** If the bCommit parameter is non-zero, this function assumes that the
  2790. ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
  2791. ** or an error. bCommit is passed true for an auto-vacuum-on-commmit
  2792. ** operation, or false for an incremental vacuum.
  2793. */
  2794. static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  2795. Pgno nFreeList; /* Number of pages still on the free-list */
  2796. int rc;
  2797. assert( sqlite3_mutex_held(pBt->mutex) );
  2798. assert( iLastPg>nFin );
  2799. if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
  2800. u8 eType;
  2801. Pgno iPtrPage;
  2802. nFreeList = get4byte(&pBt->pPage1->aData[36]);
  2803. if( nFreeList==0 ){
  2804. return SQLITE_DONE;
  2805. }
  2806. rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
  2807. if( rc!=SQLITE_OK ){
  2808. return rc;
  2809. }
  2810. if( eType==PTRMAP_ROOTPAGE ){
  2811. return SQLITE_CORRUPT_BKPT;
  2812. }
  2813. if( eType==PTRMAP_FREEPAGE ){
  2814. if( bCommit==0 ){
  2815. /* Remove the page from the files free-list. This is not required
  2816. ** if bCommit is non-zero. In that case, the free-list will be
  2817. ** truncated to zero after this function returns, so it doesn't
  2818. ** matter if it still contains some garbage entries.
  2819. */
  2820. Pgno iFreePg;
  2821. MemPage *pFreePg;
  2822. rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
  2823. if( rc!=SQLITE_OK ){
  2824. return rc;
  2825. }
  2826. assert( iFreePg==iLastPg );
  2827. releasePage(pFreePg);
  2828. }
  2829. } else {
  2830. Pgno iFreePg; /* Index of free page to move pLastPg to */
  2831. MemPage *pLastPg;
  2832. u8 eMode = BTALLOC_ANY; /* Mode parameter for allocateBtreePage() */
  2833. Pgno iNear = 0; /* nearby parameter for allocateBtreePage() */
  2834. rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
  2835. if( rc!=SQLITE_OK ){
  2836. return rc;
  2837. }
  2838. /* If bCommit is zero, this loop runs exactly once and page pLastPg
  2839. ** is swapped with the first free page pulled off the free list.
  2840. **
  2841. ** On the other hand, if bCommit is greater than zero, then keep
  2842. ** looping until a free-page located within the first nFin pages
  2843. ** of the file is found.
  2844. */
  2845. if( bCommit==0 ){
  2846. eMode = BTALLOC_LE;
  2847. iNear = nFin;
  2848. }
  2849. do {
  2850. MemPage *pFreePg;
  2851. rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
  2852. if( rc!=SQLITE_OK ){
  2853. releasePage(pLastPg);
  2854. return rc;
  2855. }
  2856. releasePage(pFreePg);
  2857. }while( bCommit && iFreePg>nFin );
  2858. assert( iFreePg<iLastPg );
  2859. rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
  2860. releasePage(pLastPg);
  2861. if( rc!=SQLITE_OK ){
  2862. return rc;
  2863. }
  2864. }
  2865. }
  2866. if( bCommit==0 ){
  2867. do {
  2868. iLastPg--;
  2869. }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
  2870. pBt->bDoTruncate = 1;
  2871. pBt->nPage = iLastPg;
  2872. }
  2873. return SQLITE_OK;
  2874. }
  2875. /*
  2876. ** The database opened by the first argument is an auto-vacuum database
  2877. ** nOrig pages in size containing nFree free pages. Return the expected
  2878. ** size of the database in pages following an auto-vacuum operation.
  2879. */
  2880. static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
  2881. int nEntry; /* Number of entries on one ptrmap page */
  2882. Pgno nPtrmap; /* Number of PtrMap pages to be freed */
  2883. Pgno nFin; /* Return value */
  2884. nEntry = pBt->usableSize/5;
  2885. nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
  2886. nFin = nOrig - nFree - nPtrmap;
  2887. if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
  2888. nFin--;
  2889. }
  2890. while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
  2891. nFin--;
  2892. }
  2893. return nFin;
  2894. }
  2895. /*
  2896. ** A write-transaction must be opened before calling this function.
  2897. ** It performs a single unit of work towards an incremental vacuum.
  2898. **
  2899. ** If the incremental vacuum is finished after this function has run,
  2900. ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
  2901. ** SQLITE_OK is returned. Otherwise an SQLite error code.
  2902. */
  2903. int sqlite3BtreeIncrVacuum(Btree *p){
  2904. int rc;
  2905. BtShared *pBt = p->pBt;
  2906. sqlite3BtreeEnter(p);
  2907. assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  2908. if( !pBt->autoVacuum ){
  2909. rc = SQLITE_DONE;
  2910. }else{
  2911. Pgno nOrig = btreePagecount(pBt);
  2912. Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
  2913. Pgno nFin = finalDbSize(pBt, nOrig, nFree);
  2914. if( nOrig<nFin ){
  2915. rc = SQLITE_CORRUPT_BKPT;
  2916. }else if( nFree>0 ){
  2917. rc = saveAllCursors(pBt, 0, 0);
  2918. if( rc==SQLITE_OK ){
  2919. invalidateAllOverflowCache(pBt);
  2920. rc = incrVacuumStep(pBt, nFin, nOrig, 0);
  2921. }
  2922. if( rc==SQLITE_OK ){
  2923. rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  2924. put4byte(&pBt->pPage1->aData[28], pBt->nPage);
  2925. }
  2926. }else{
  2927. rc = SQLITE_DONE;
  2928. }
  2929. }
  2930. sqlite3BtreeLeave(p);
  2931. return rc;
  2932. }
  2933. /*
  2934. ** This routine is called prior to sqlite3PagerCommit when a transaction
  2935. ** is committed for an auto-vacuum database.
  2936. **
  2937. ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
  2938. ** the database file should be truncated to during the commit process.
  2939. ** i.e. the database has been reorganized so that only the first *pnTrunc
  2940. ** pages are in use.
  2941. */
  2942. static int autoVacuumCommit(BtShared *pBt){
  2943. int rc = SQLITE_OK;
  2944. Pager *pPager = pBt->pPager;
  2945. VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
  2946. assert( sqlite3_mutex_held(pBt->mutex) );
  2947. invalidateAllOverflowCache(pBt);
  2948. assert(pBt->autoVacuum);
  2949. if( !pBt->incrVacuum ){
  2950. Pgno nFin; /* Number of pages in database after autovacuuming */
  2951. Pgno nFree; /* Number of pages on the freelist initially */
  2952. Pgno iFree; /* The next page to be freed */
  2953. Pgno nOrig; /* Database size before freeing */
  2954. nOrig = btreePagecount(pBt);
  2955. if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
  2956. /* It is not possible to create a database for which the final page
  2957. ** is either a pointer-map page or the pending-byte page. If one
  2958. ** is encountered, this indicates corruption.
  2959. */
  2960. return SQLITE_CORRUPT_BKPT;
  2961. }
  2962. nFree = get4byte(&pBt->pPage1->aData[36]);
  2963. nFin = finalDbSize(pBt, nOrig, nFree);
  2964. if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
  2965. if( nFin<nOrig ){
  2966. rc = saveAllCursors(pBt, 0, 0);
  2967. }
  2968. for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
  2969. rc = incrVacuumStep(pBt, nFin, iFree, 1);
  2970. }
  2971. if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
  2972. rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  2973. put4byte(&pBt->pPage1->aData[32], 0);
  2974. put4byte(&pBt->pPage1->aData[36], 0);
  2975. put4byte(&pBt->pPage1->aData[28], nFin);
  2976. pBt->bDoTruncate = 1;
  2977. pBt->nPage = nFin;
  2978. }
  2979. if( rc!=SQLITE_OK ){
  2980. sqlite3PagerRollback(pPager);
  2981. }
  2982. }
  2983. assert( nRef>=sqlite3PagerRefcount(pPager) );
  2984. return rc;
  2985. }
  2986. #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
  2987. # define setChildPtrmaps(x) SQLITE_OK
  2988. #endif
  2989. /*
  2990. ** This routine does the first phase of a two-phase commit. This routine
  2991. ** causes a rollback journal to be created (if it does not already exist)
  2992. ** and populated with enough information so that if a power loss occurs
  2993. ** the database can be restored to its original state by playing back
  2994. ** the journal. Then the contents of the journal are flushed out to
  2995. ** the disk. After the journal is safely on oxide, the changes to the
  2996. ** database are written into the database file and flushed to oxide.
  2997. ** At the end of this call, the rollback journal still exists on the
  2998. ** disk and we are still holding all locks, so the transaction has not
  2999. ** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
  3000. ** commit process.
  3001. **
  3002. ** This call is a no-op if no write-transaction is currently active on pBt.
  3003. **
  3004. ** Otherwise, sync the database file for the btree pBt. zMaster points to
  3005. ** the name of a master journal file that should be written into the
  3006. ** individual journal file, or is NULL, indicating no master journal file
  3007. ** (single database transaction).
  3008. **
  3009. ** When this is called, the master journal should already have been
  3010. ** created, populated with this journal pointer and synced to disk.
  3011. **
  3012. ** Once this is routine has returned, the only thing required to commit
  3013. ** the write-transaction for this database file is to delete the journal.
  3014. */
  3015. int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
  3016. int rc = SQLITE_OK;
  3017. if( p->inTrans==TRANS_WRITE ){
  3018. BtShared *pBt = p->pBt;
  3019. sqlite3BtreeEnter(p);
  3020. #ifndef SQLITE_OMIT_AUTOVACUUM
  3021. if( pBt->autoVacuum ){
  3022. rc = autoVacuumCommit(pBt);
  3023. if( rc!=SQLITE_OK ){
  3024. sqlite3BtreeLeave(p);
  3025. return rc;
  3026. }
  3027. }
  3028. if( pBt->bDoTruncate ){
  3029. sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
  3030. }
  3031. #endif
  3032. rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
  3033. sqlite3BtreeLeave(p);
  3034. }
  3035. return rc;
  3036. }
  3037. /*
  3038. ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
  3039. ** at the conclusion of a transaction.
  3040. */
  3041. static void btreeEndTransaction(Btree *p){
  3042. BtShared *pBt = p->pBt;
  3043. sqlite3 *db = p->db;
  3044. assert( sqlite3BtreeHoldsMutex(p) );
  3045. #ifndef SQLITE_OMIT_AUTOVACUUM
  3046. pBt->bDoTruncate = 0;
  3047. #endif
  3048. if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
  3049. /* If there are other active statements that belong to this database
  3050. ** handle, downgrade to a read-only transaction. The other statements
  3051. ** may still be reading from the database. */
  3052. downgradeAllSharedCacheTableLocks(p);
  3053. p->inTrans = TRANS_READ;
  3054. }else{
  3055. /* If the handle had any kind of transaction open, decrement the
  3056. ** transaction count of the shared btree. If the transaction count
  3057. ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
  3058. ** call below will unlock the pager. */
  3059. if( p->inTrans!=TRANS_NONE ){
  3060. clearAllSharedCacheTableLocks(p);
  3061. pBt->nTransaction--;
  3062. if( 0==pBt->nTransaction ){
  3063. pBt->inTransaction = TRANS_NONE;
  3064. }
  3065. }
  3066. /* Set the current transaction state to TRANS_NONE and unlock the
  3067. ** pager if this call closed the only read or write transaction. */
  3068. p->inTrans = TRANS_NONE;
  3069. unlockBtreeIfUnused(pBt);
  3070. }
  3071. btreeIntegrity(p);
  3072. }
  3073. /*
  3074. ** Commit the transaction currently in progress.
  3075. **
  3076. ** This routine implements the second phase of a 2-phase commit. The
  3077. ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
  3078. ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
  3079. ** routine did all the work of writing information out to disk and flushing the
  3080. ** contents so that they are written onto the disk platter. All this
  3081. ** routine has to do is delete or truncate or zero the header in the
  3082. ** the rollback journal (which causes the transaction to commit) and
  3083. ** drop locks.
  3084. **
  3085. ** Normally, if an error occurs while the pager layer is attempting to
  3086. ** finalize the underlying journal file, this function returns an error and
  3087. ** the upper layer will attempt a rollback. However, if the second argument
  3088. ** is non-zero then this b-tree transaction is part of a multi-file
  3089. ** transaction. In this case, the transaction has already been committed
  3090. ** (by deleting a master journal file) and the caller will ignore this
  3091. ** functions return code. So, even if an error occurs in the pager layer,
  3092. ** reset the b-tree objects internal state to indicate that the write
  3093. ** transaction has been closed. This is quite safe, as the pager will have
  3094. ** transitioned to the error state.
  3095. **
  3096. ** This will release the write lock on the database file. If there
  3097. ** are no active cursors, it also releases the read lock.
  3098. */
  3099. int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
  3100. if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
  3101. sqlite3BtreeEnter(p);
  3102. btreeIntegrity(p);
  3103. /* If the handle has a write-transaction open, commit the shared-btrees
  3104. ** transaction and set the shared state to TRANS_READ.
  3105. */
  3106. if( p->inTrans==TRANS_WRITE ){
  3107. int rc;
  3108. BtShared *pBt = p->pBt;
  3109. assert( pBt->inTransaction==TRANS_WRITE );
  3110. assert( pBt->nTransaction>0 );
  3111. rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
  3112. if( rc!=SQLITE_OK && bCleanup==0 ){
  3113. sqlite3BtreeLeave(p);
  3114. return rc;
  3115. }
  3116. pBt->inTransaction = TRANS_READ;
  3117. btreeClearHasContent(pBt);
  3118. }
  3119. btreeEndTransaction(p);
  3120. sqlite3BtreeLeave(p);
  3121. return SQLITE_OK;
  3122. }
  3123. /*
  3124. ** Do both phases of a commit.
  3125. */
  3126. int sqlite3BtreeCommit(Btree *p){
  3127. int rc;
  3128. sqlite3BtreeEnter(p);
  3129. rc = sqlite3BtreeCommitPhaseOne(p, 0);
  3130. if( rc==SQLITE_OK ){
  3131. rc = sqlite3BtreeCommitPhaseTwo(p, 0);
  3132. }
  3133. sqlite3BtreeLeave(p);
  3134. return rc;
  3135. }
  3136. /*
  3137. ** This routine sets the state to CURSOR_FAULT and the error
  3138. ** code to errCode for every cursor on BtShared that pBtree
  3139. ** references.
  3140. **
  3141. ** Every cursor is tripped, including cursors that belong
  3142. ** to other database connections that happen to be sharing
  3143. ** the cache with pBtree.
  3144. **
  3145. ** This routine gets called when a rollback occurs.
  3146. ** All cursors using the same cache must be tripped
  3147. ** to prevent them from trying to use the btree after
  3148. ** the rollback. The rollback may have deleted tables
  3149. ** or moved root pages, so it is not sufficient to
  3150. ** save the state of the cursor. The cursor must be
  3151. ** invalidated.
  3152. */
  3153. void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
  3154. BtCursor *p;
  3155. if( pBtree==0 ) return;
  3156. sqlite3BtreeEnter(pBtree);
  3157. for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  3158. int i;
  3159. sqlite3BtreeClearCursor(p);
  3160. p->eState = CURSOR_FAULT;
  3161. p->skipNext = errCode;
  3162. for(i=0; i<=p->iPage; i++){
  3163. releasePage(p->apPage[i]);
  3164. p->apPage[i] = 0;
  3165. }
  3166. }
  3167. sqlite3BtreeLeave(pBtree);
  3168. }
  3169. /*
  3170. ** Rollback the transaction in progress. All cursors will be
  3171. ** invalided by this operation. Any attempt to use a cursor
  3172. ** that was open at the beginning of this operation will result
  3173. ** in an error.
  3174. **
  3175. ** This will release the write lock on the database file. If there
  3176. ** are no active cursors, it also releases the read lock.
  3177. */
  3178. int sqlite3BtreeRollback(Btree *p, int tripCode){
  3179. int rc;
  3180. BtShared *pBt = p->pBt;
  3181. MemPage *pPage1;
  3182. sqlite3BtreeEnter(p);
  3183. if( tripCode==SQLITE_OK ){
  3184. rc = tripCode = saveAllCursors(pBt, 0, 0);
  3185. }else{
  3186. rc = SQLITE_OK;
  3187. }
  3188. if( tripCode ){
  3189. sqlite3BtreeTripAllCursors(p, tripCode);
  3190. }
  3191. btreeIntegrity(p);
  3192. if( p->inTrans==TRANS_WRITE ){
  3193. int rc2;
  3194. assert( TRANS_WRITE==pBt->inTransaction );
  3195. rc2 = sqlite3PagerRollback(pBt->pPager);
  3196. if( rc2!=SQLITE_OK ){
  3197. rc = rc2;
  3198. }
  3199. /* The rollback may have destroyed the pPage1->aData value. So
  3200. ** call btreeGetPage() on page 1 again to make
  3201. ** sure pPage1->aData is set correctly. */
  3202. if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
  3203. int nPage = get4byte(28+(u8*)pPage1->aData);
  3204. testcase( nPage==0 );
  3205. if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
  3206. testcase( pBt->nPage!=nPage );
  3207. pBt->nPage = nPage;
  3208. releasePage(pPage1);
  3209. }
  3210. assert( countValidCursors(pBt, 1)==0 );
  3211. pBt->inTransaction = TRANS_READ;
  3212. btreeClearHasContent(pBt);
  3213. }
  3214. btreeEndTransaction(p);
  3215. sqlite3BtreeLeave(p);
  3216. return rc;
  3217. }
  3218. /*
  3219. ** Start a statement subtransaction. The subtransaction can can be rolled
  3220. ** back independently of the main transaction. You must start a transaction
  3221. ** before starting a subtransaction. The subtransaction is ended automatically
  3222. ** if the main transaction commits or rolls back.
  3223. **
  3224. ** Statement subtransactions are used around individual SQL statements
  3225. ** that are contained within a BEGIN...COMMIT block. If a constraint
  3226. ** error occurs within the statement, the effect of that one statement
  3227. ** can be rolled back without having to rollback the entire transaction.
  3228. **
  3229. ** A statement sub-transaction is implemented as an anonymous savepoint. The
  3230. ** value passed as the second parameter is the total number of savepoints,
  3231. ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
  3232. ** are no active savepoints and no other statement-transactions open,
  3233. ** iStatement is 1. This anonymous savepoint can be released or rolled back
  3234. ** using the sqlite3BtreeSavepoint() function.
  3235. */
  3236. int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
  3237. int rc;
  3238. BtShared *pBt = p->pBt;
  3239. sqlite3BtreeEnter(p);
  3240. assert( p->inTrans==TRANS_WRITE );
  3241. assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  3242. assert( iStatement>0 );
  3243. assert( iStatement>p->db->nSavepoint );
  3244. assert( pBt->inTransaction==TRANS_WRITE );
  3245. /* At the pager level, a statement transaction is a savepoint with
  3246. ** an index greater than all savepoints created explicitly using
  3247. ** SQL statements. It is illegal to open, release or rollback any
  3248. ** such savepoints while the statement transaction savepoint is active.
  3249. */
  3250. rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
  3251. sqlite3BtreeLeave(p);
  3252. return rc;
  3253. }
  3254. /*
  3255. ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
  3256. ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
  3257. ** savepoint identified by parameter iSavepoint, depending on the value
  3258. ** of op.
  3259. **
  3260. ** Normally, iSavepoint is greater than or equal to zero. However, if op is
  3261. ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
  3262. ** contents of the entire transaction are rolled back. This is different
  3263. ** from a normal transaction rollback, as no locks are released and the
  3264. ** transaction remains open.
  3265. */
  3266. int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
  3267. int rc = SQLITE_OK;
  3268. if( p && p->inTrans==TRANS_WRITE ){
  3269. BtShared *pBt = p->pBt;
  3270. assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
  3271. assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
  3272. sqlite3BtreeEnter(p);
  3273. rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
  3274. if( rc==SQLITE_OK ){
  3275. if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
  3276. pBt->nPage = 0;
  3277. }
  3278. rc = newDatabase(pBt);
  3279. pBt->nPage = get4byte(28 + pBt->pPage1->aData);
  3280. /* The database size was written into the offset 28 of the header
  3281. ** when the transaction started, so we know that the value at offset
  3282. ** 28 is nonzero. */
  3283. assert( pBt->nPage>0 );
  3284. }
  3285. sqlite3BtreeLeave(p);
  3286. }
  3287. return rc;
  3288. }
  3289. /*
  3290. ** Create a new cursor for the BTree whose root is on the page
  3291. ** iTable. If a read-only cursor is requested, it is assumed that
  3292. ** the caller already has at least a read-only transaction open
  3293. ** on the database already. If a write-cursor is requested, then
  3294. ** the caller is assumed to have an open write transaction.
  3295. **
  3296. ** If wrFlag==0, then the cursor can only be used for reading.
  3297. ** If wrFlag==1, then the cursor can be used for reading or for
  3298. ** writing if other conditions for writing are also met. These
  3299. ** are the conditions that must be met in order for writing to
  3300. ** be allowed:
  3301. **
  3302. ** 1: The cursor must have been opened with wrFlag==1
  3303. **
  3304. ** 2: Other database connections that share the same pager cache
  3305. ** but which are not in the READ_UNCOMMITTED state may not have
  3306. ** cursors open with wrFlag==0 on the same table. Otherwise
  3307. ** the changes made by this write cursor would be visible to
  3308. ** the read cursors in the other database connection.
  3309. **
  3310. ** 3: The database must be writable (not on read-only media)
  3311. **
  3312. ** 4: There must be an active transaction.
  3313. **
  3314. ** No checking is done to make sure that page iTable really is the
  3315. ** root page of a b-tree. If it is not, then the cursor acquired
  3316. ** will not work correctly.
  3317. **
  3318. ** It is assumed that the sqlite3BtreeCursorZero() has been called
  3319. ** on pCur to initialize the memory space prior to invoking this routine.
  3320. */
  3321. static int btreeCursor(
  3322. Btree *p, /* The btree */
  3323. int iTable, /* Root page of table to open */
  3324. int wrFlag, /* 1 to write. 0 read-only */
  3325. struct KeyInfo *pKeyInfo, /* First arg to comparison function */
  3326. BtCursor *pCur /* Space for new cursor */
  3327. ){
  3328. BtShared *pBt = p->pBt; /* Shared b-tree handle */
  3329. assert( sqlite3BtreeHoldsMutex(p) );
  3330. assert( wrFlag==0 || wrFlag==1 );
  3331. /* The following assert statements verify that if this is a sharable
  3332. ** b-tree database, the connection is holding the required table locks,
  3333. ** and that no other connection has any open cursor that conflicts with
  3334. ** this lock. */
  3335. assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
  3336. assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
  3337. /* Assert that the caller has opened the required transaction. */
  3338. assert( p->inTrans>TRANS_NONE );
  3339. assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
  3340. assert( pBt->pPage1 && pBt->pPage1->aData );
  3341. if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
  3342. return SQLITE_READONLY;
  3343. }
  3344. if( iTable==1 && btreePagecount(pBt)==0 ){
  3345. assert( wrFlag==0 );
  3346. iTable = 0;
  3347. }
  3348. /* Now that no other errors can occur, finish filling in the BtCursor
  3349. ** variables and link the cursor into the BtShared list. */
  3350. pCur->pgnoRoot = (Pgno)iTable;
  3351. pCur->iPage = -1;
  3352. pCur->pKeyInfo = pKeyInfo;
  3353. pCur->pBtree = p;
  3354. pCur->pBt = pBt;
  3355. pCur->wrFlag = (u8)wrFlag;
  3356. pCur->pNext = pBt->pCursor;
  3357. if( pCur->pNext ){
  3358. pCur->pNext->pPrev = pCur;
  3359. }
  3360. pBt->pCursor = pCur;
  3361. pCur->eState = CURSOR_INVALID;
  3362. pCur->cachedRowid = 0;
  3363. return SQLITE_OK;
  3364. }
  3365. int sqlite3BtreeCursor(
  3366. Btree *p, /* The btree */
  3367. int iTable, /* Root page of table to open */
  3368. int wrFlag, /* 1 to write. 0 read-only */
  3369. struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
  3370. BtCursor *pCur /* Write new cursor here */
  3371. ){
  3372. int rc;
  3373. sqlite3BtreeEnter(p);
  3374. rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  3375. sqlite3BtreeLeave(p);
  3376. return rc;
  3377. }
  3378. /*
  3379. ** Return the size of a BtCursor object in bytes.
  3380. **
  3381. ** This interfaces is needed so that users of cursors can preallocate
  3382. ** sufficient storage to hold a cursor. The BtCursor object is opaque
  3383. ** to users so they cannot do the sizeof() themselves - they must call
  3384. ** this routine.
  3385. */
  3386. int sqlite3BtreeCursorSize(void){
  3387. return ROUND8(sizeof(BtCursor));
  3388. }
  3389. /*
  3390. ** Initialize memory that will be converted into a BtCursor object.
  3391. **
  3392. ** The simple approach here would be to memset() the entire object
  3393. ** to zero. But it turns out that the apPage[] and aiIdx[] arrays
  3394. ** do not need to be zeroed and they are large, so we can save a lot
  3395. ** of run-time by skipping the initialization of those elements.
  3396. */
  3397. void sqlite3BtreeCursorZero(BtCursor *p){
  3398. memset(p, 0, offsetof(BtCursor, iPage));
  3399. }
  3400. /*
  3401. ** Set the cached rowid value of every cursor in the same database file
  3402. ** as pCur and having the same root page number as pCur. The value is
  3403. ** set to iRowid.
  3404. **
  3405. ** Only positive rowid values are considered valid for this cache.
  3406. ** The cache is initialized to zero, indicating an invalid cache.
  3407. ** A btree will work fine with zero or negative rowids. We just cannot
  3408. ** cache zero or negative rowids, which means tables that use zero or
  3409. ** negative rowids might run a little slower. But in practice, zero
  3410. ** or negative rowids are very uncommon so this should not be a problem.
  3411. */
  3412. void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid){
  3413. BtCursor *p;
  3414. for(p=pCur->pBt->pCursor; p; p=p->pNext){
  3415. if( p->pgnoRoot==pCur->pgnoRoot ) p->cachedRowid = iRowid;
  3416. }
  3417. assert( pCur->cachedRowid==iRowid );
  3418. }
  3419. /*
  3420. ** Return the cached rowid for the given cursor. A negative or zero
  3421. ** return value indicates that the rowid cache is invalid and should be
  3422. ** ignored. If the rowid cache has never before been set, then a
  3423. ** zero is returned.
  3424. */
  3425. sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur){
  3426. return pCur->cachedRowid;
  3427. }
  3428. /*
  3429. ** Close a cursor. The read lock on the database file is released
  3430. ** when the last cursor is closed.
  3431. */
  3432. int sqlite3BtreeCloseCursor(BtCursor *pCur){
  3433. Btree *pBtree = pCur->pBtree;
  3434. if( pBtree ){
  3435. int i;
  3436. BtShared *pBt = pCur->pBt;
  3437. sqlite3BtreeEnter(pBtree);
  3438. sqlite3BtreeClearCursor(pCur);
  3439. if( pCur->pPrev ){
  3440. pCur->pPrev->pNext = pCur->pNext;
  3441. }else{
  3442. pBt->pCursor = pCur->pNext;
  3443. }
  3444. if( pCur->pNext ){
  3445. pCur->pNext->pPrev = pCur->pPrev;
  3446. }
  3447. for(i=0; i<=pCur->iPage; i++){
  3448. releasePage(pCur->apPage[i]);
  3449. }
  3450. unlockBtreeIfUnused(pBt);
  3451. invalidateOverflowCache(pCur);
  3452. /* sqlite3_free(pCur); */
  3453. sqlite3BtreeLeave(pBtree);
  3454. }
  3455. return SQLITE_OK;
  3456. }
  3457. /*
  3458. ** Make sure the BtCursor* given in the argument has a valid
  3459. ** BtCursor.info structure. If it is not already valid, call
  3460. ** btreeParseCell() to fill it in.
  3461. **
  3462. ** BtCursor.info is a cache of the information in the current cell.
  3463. ** Using this cache reduces the number of calls to btreeParseCell().
  3464. **
  3465. ** 2007-06-25: There is a bug in some versions of MSVC that causes the
  3466. ** compiler to crash when getCellInfo() is implemented as a macro.
  3467. ** But there is a measurable speed advantage to using the macro on gcc
  3468. ** (when fewer compiler optimizations like -Os or -O0 are used and the
  3469. ** compiler is not doing aggressive inlining.) So we use a real function
  3470. ** for MSVC and a macro for everything else. Ticket #2457.
  3471. */
  3472. #ifndef NDEBUG
  3473. static void assertCellInfo(BtCursor *pCur){
  3474. CellInfo info;
  3475. int iPage = pCur->iPage;
  3476. memset(&info, 0, sizeof(info));
  3477. btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
  3478. assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
  3479. }
  3480. #else
  3481. #define assertCellInfo(x)
  3482. #endif
  /*
  ** getCellInfo(pCur): make sure pCur->info describes the cell the cursor
  ** currently points at. When pCur->info.nSize is zero the cell is parsed
  ** with btreeParseCell() and pCur->validNKey is set to 1; otherwise the
  ** cached copy is only cross-checked with assertCellInfo().
  */
  #ifdef _MSC_VER
  /* Use a real function in MSVC to work around bugs in that compiler. */
  static void getCellInfo(BtCursor *pCur){
    if( pCur->info.nSize==0 ){
      int iPage = pCur->iPage;
      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
      pCur->validNKey = 1;
    }else{
      assertCellInfo(pCur);
    }
  }
  #else /* if not _MSC_VER */
  /* Use a macro in all other compilers so that the code is inlined.
  **
  ** The body is wrapped in do{...}while(0) so that "getCellInfo(p);" is
  ** a single statement and can be used safely as the unbraced body of an
  ** if/else. The argument is parenthesized at every use. Note that the
  ** argument is still evaluated more than once, so it must not have side
  ** effects. */
  #define getCellInfo(pCur)                                                  \
    do{                                                                      \
      if( (pCur)->info.nSize==0 ){                                           \
        int iPage = (pCur)->iPage;                                           \
        btreeParseCell((pCur)->apPage[iPage],(pCur)->aiIdx[iPage],           \
                       &(pCur)->info);                                       \
        (pCur)->validNKey = 1;                                               \
      }else{                                                                 \
        assertCellInfo(pCur);                                                \
      }                                                                      \
    }while(0)
  #endif /* _MSC_VER */
  3505. #ifndef NDEBUG /* The next routine used only within assert() statements */
  3506. /*
  3507. ** Return true if the given BtCursor is valid. A valid cursor is one
  3508. ** that is currently pointing to a row in a (non-empty) table.
  3509. ** This is a verification routine is used only within assert() statements.
  3510. */
  3511. int sqlite3BtreeCursorIsValid(BtCursor *pCur){
  3512. return pCur && pCur->eState==CURSOR_VALID;
  3513. }
  3514. #endif /* NDEBUG */
  3515. /*
  3516. ** Set *pSize to the size of the buffer needed to hold the value of
  3517. ** the key for the current entry. If the cursor is not pointing
  3518. ** to a valid entry, *pSize is set to 0.
  3519. **
  3520. ** For a table with the INTKEY flag set, this routine returns the key
  3521. ** itself, not the number of bytes in the key.
  3522. **
  3523. ** The caller must position the cursor prior to invoking this routine.
  3524. **
  3525. ** This routine cannot fail. It always returns SQLITE_OK.
  3526. */
  3527. int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
  3528. assert( cursorHoldsMutex(pCur) );
  3529. assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  3530. if( pCur->eState!=CURSOR_VALID ){
  3531. *pSize = 0;
  3532. }else{
  3533. getCellInfo(pCur);
  3534. *pSize = pCur->info.nKey;
  3535. }
  3536. return SQLITE_OK;
  3537. }
  3538. /*
  3539. ** Set *pSize to the number of bytes of data in the entry the
  3540. ** cursor currently points to.
  3541. **
  3542. ** The caller must guarantee that the cursor is pointing to a non-NULL
  3543. ** valid entry. In other words, the calling procedure must guarantee
  3544. ** that the cursor has Cursor.eState==CURSOR_VALID.
  3545. **
  3546. ** Failure is not possible. This function always returns SQLITE_OK.
  3547. ** It might just as well be a procedure (returning void) but we continue
  3548. ** to return an integer result code for historical reasons.
  3549. */
  3550. int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
  3551. assert( cursorHoldsMutex(pCur) );
  3552. assert( pCur->eState==CURSOR_VALID );
  3553. getCellInfo(pCur);
  3554. *pSize = pCur->info.nData;
  3555. return SQLITE_OK;
  3556. }
  3557. /*
  3558. ** Given the page number of an overflow page in the database (parameter
  3559. ** ovfl), this function finds the page number of the next page in the
  3560. ** linked list of overflow pages. If possible, it uses the auto-vacuum
  3561. ** pointer-map data instead of reading the content of page ovfl to do so.
  3562. **
  3563. ** If an error occurs an SQLite error code is returned. Otherwise:
  3564. **
  3565. ** The page number of the next overflow page in the linked list is
  3566. ** written to *pPgnoNext. If page ovfl is the last page in its linked
  3567. ** list, *pPgnoNext is set to zero.
  3568. **
  3569. ** If ppPage is not NULL, and a reference to the MemPage object corresponding
  3570. ** to page number pOvfl was obtained, then *ppPage is set to point to that
  3571. ** reference. It is the responsibility of the caller to call releasePage()
  3572. ** on *ppPage to free the reference. In no reference was obtained (because
  3573. ** the pointer-map was used to obtain the value for *pPgnoNext), then
  3574. ** *ppPage is set to zero.
  3575. */
  3576. static int getOverflowPage(
  3577. BtShared *pBt, /* The database file */
  3578. Pgno ovfl, /* Current overflow page number */
  3579. MemPage **ppPage, /* OUT: MemPage handle (may be NULL) */
  3580. Pgno *pPgnoNext /* OUT: Next overflow page number */
  3581. ){
  3582. Pgno next = 0;
  3583. MemPage *pPage = 0;
  3584. int rc = SQLITE_OK;
  3585. assert( sqlite3_mutex_held(pBt->mutex) );
  3586. assert(pPgnoNext);
  3587. #ifndef SQLITE_OMIT_AUTOVACUUM
  3588. /* Try to find the next page in the overflow list using the
  3589. ** autovacuum pointer-map pages. Guess that the next page in
  3590. ** the overflow list is page number (ovfl+1). If that guess turns
  3591. ** out to be wrong, fall back to loading the data of page
  3592. ** number ovfl to determine the next page number.
  3593. */
  3594. if( pBt->autoVacuum ){
  3595. Pgno pgno;
  3596. Pgno iGuess = ovfl+1;
  3597. u8 eType;
  3598. while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
  3599. iGuess++;
  3600. }
  3601. if( iGuess<=btreePagecount(pBt) ){
  3602. rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
  3603. if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
  3604. next = iGuess;
  3605. rc = SQLITE_DONE;
  3606. }
  3607. }
  3608. }
  3609. #endif
  3610. assert( next==0 || rc==SQLITE_DONE );
  3611. if( rc==SQLITE_OK ){
  3612. rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
  3613. assert( rc==SQLITE_OK || pPage==0 );
  3614. if( rc==SQLITE_OK ){
  3615. next = get4byte(pPage->aData);
  3616. }
  3617. }
  3618. *pPgnoNext = next;
  3619. if( ppPage ){
  3620. *ppPage = pPage;
  3621. }else{
  3622. releasePage(pPage);
  3623. }
  3624. return (rc==SQLITE_DONE ? SQLITE_OK : rc);
  3625. }
  3626. /*
  3627. ** Copy data from a buffer to a page, or from a page to a buffer.
  3628. **
  3629. ** pPayload is a pointer to data stored on database page pDbPage.
  3630. ** If argument eOp is false, then nByte bytes of data are copied
  3631. ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
  3632. ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
  3633. ** of data are copied from the buffer pBuf to pPayload.
  3634. **
  3635. ** SQLITE_OK is returned on success, otherwise an error code.
  3636. */
  3637. static int copyPayload(
  3638. void *pPayload, /* Pointer to page data */
  3639. void *pBuf, /* Pointer to buffer */
  3640. int nByte, /* Number of bytes to copy */
  3641. int eOp, /* 0 -> copy from page, 1 -> copy to page */
  3642. DbPage *pDbPage /* Page containing pPayload */
  3643. ){
  3644. if( eOp ){
  3645. /* Copy data from buffer to page (a write operation) */
  3646. int rc = sqlite3PagerWrite(pDbPage);
  3647. if( rc!=SQLITE_OK ){
  3648. return rc;
  3649. }
  3650. memcpy(pPayload, pBuf, nByte);
  3651. }else{
  3652. /* Copy data from page to buffer (a read operation) */
  3653. memcpy(pBuf, pPayload, nByte);
  3654. }
  3655. return SQLITE_OK;
  3656. }
  3657. /*
  3658. ** This function is used to read or overwrite payload information
  3659. ** for the entry that the pCur cursor is pointing to. If the eOp
  3660. ** parameter is 0, this is a read operation (data copied into
  3661. ** buffer pBuf). If it is non-zero, a write (data copied from
  3662. ** buffer pBuf).
  3663. **
  3664. ** A total of "amt" bytes are read or written beginning at "offset".
  3665. ** Data is read to or from the buffer pBuf.
  3666. **
  3667. ** The content being read or written might appear on the main page
  3668. ** or be scattered out on multiple overflow pages.
  3669. **
  3670. ** If the BtCursor.isIncrblobHandle flag is set, and the current
  3671. ** cursor entry uses one or more overflow pages, this function
  3672. ** allocates space for and lazily popluates the overflow page-list
  3673. ** cache array (BtCursor.aOverflow). Subsequent calls use this
  3674. ** cache to make seeking to the supplied offset more efficient.
  3675. **
  3676. ** Once an overflow page-list cache has been allocated, it may be
  3677. ** invalidated if some other cursor writes to the same table, or if
  3678. ** the cursor is moved to a different row. Additionally, in auto-vacuum
  3679. ** mode, the following events may invalidate an overflow page-list cache.
  3680. **
  3681. ** * An incremental vacuum,
  3682. ** * A commit in auto_vacuum="full" mode,
  3683. ** * Creating a table (may require moving an overflow page).
  3684. */
  3685. static int accessPayload(
  3686. BtCursor *pCur, /* Cursor pointing to entry to read from */
  3687. u32 offset, /* Begin reading this far into payload */
  3688. u32 amt, /* Read this many bytes */
  3689. unsigned char *pBuf, /* Write the bytes into this buffer */
  3690. int eOp /* zero to read. non-zero to write. */
  3691. ){
  3692. unsigned char *aPayload;
  3693. int rc = SQLITE_OK;
  3694. u32 nKey;
  3695. int iIdx = 0;
  3696. MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  3697. BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
  3698. assert( pPage );
  3699. assert( pCur->eState==CURSOR_VALID );
  3700. assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3701. assert( cursorHoldsMutex(pCur) );
  3702. getCellInfo(pCur);
  3703. aPayload = pCur->info.pCell + pCur->info.nHeader;
  3704. nKey = (pPage->intKey ? 0 : (int)pCur->info.nKey);
  3705. if( NEVER(offset+amt > nKey+pCur->info.nData)
  3706. || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
  3707. ){
  3708. /* Trying to read or write past the end of the data is an error */
  3709. return SQLITE_CORRUPT_BKPT;
  3710. }
  3711. /* Check if data must be read/written to/from the btree page itself. */
  3712. if( offset<pCur->info.nLocal ){
  3713. int a = amt;
  3714. if( a+offset>pCur->info.nLocal ){
  3715. a = pCur->info.nLocal - offset;
  3716. }
  3717. rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
  3718. offset = 0;
  3719. pBuf += a;
  3720. amt -= a;
  3721. }else{
  3722. offset -= pCur->info.nLocal;
  3723. }
  3724. if( rc==SQLITE_OK && amt>0 ){
  3725. const u32 ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
  3726. Pgno nextPage;
  3727. nextPage = get4byte(&aPayload[pCur->info.nLocal]);
  3728. #ifndef SQLITE_OMIT_INCRBLOB
  3729. /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
  3730. ** has not been allocated, allocate it now. The array is sized at
  3731. ** one entry for each overflow page in the overflow chain. The
  3732. ** page number of the first overflow page is stored in aOverflow[0],
  3733. ** etc. A value of 0 in the aOverflow[] array means "not yet known"
  3734. ** (the cache is lazily populated).
  3735. */
  3736. if( pCur->isIncrblobHandle && !pCur->aOverflow ){
  3737. int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
  3738. pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
  3739. /* nOvfl is always positive. If it were zero, fetchPayload would have
  3740. ** been used instead of this routine. */
  3741. if( ALWAYS(nOvfl) && !pCur->aOverflow ){
  3742. rc = SQLITE_NOMEM;
  3743. }
  3744. }
  3745. /* If the overflow page-list cache has been allocated and the
  3746. ** entry for the first required overflow page is valid, skip
  3747. ** directly to it.
  3748. */
  3749. if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
  3750. iIdx = (offset/ovflSize);
  3751. nextPage = pCur->aOverflow[iIdx];
  3752. offset = (offset%ovflSize);
  3753. }
  3754. #endif
  3755. for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
  3756. #ifndef SQLITE_OMIT_INCRBLOB
  3757. /* If required, populate the overflow page-list cache. */
  3758. if( pCur->aOverflow ){
  3759. assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
  3760. pCur->aOverflow[iIdx] = nextPage;
  3761. }
  3762. #endif
  3763. if( offset>=ovflSize ){
  3764. /* The only reason to read this page is to obtain the page
  3765. ** number for the next page in the overflow chain. The page
  3766. ** data is not required. So first try to lookup the overflow
  3767. ** page-list cache, if any, then fall back to the getOverflowPage()
  3768. ** function.
  3769. */
  3770. #ifndef SQLITE_OMIT_INCRBLOB
  3771. if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
  3772. nextPage = pCur->aOverflow[iIdx+1];
  3773. } else
  3774. #endif
  3775. rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
  3776. offset -= ovflSize;
  3777. }else{
  3778. /* Need to read this page properly. It contains some of the
  3779. ** range of data that is being read (eOp==0) or written (eOp!=0).
  3780. */
  3781. #ifdef SQLITE_DIRECT_OVERFLOW_READ
  3782. sqlite3_file *fd;
  3783. #endif
  3784. int a = amt;
  3785. if( a + offset > ovflSize ){
  3786. a = ovflSize - offset;
  3787. }
  3788. #ifdef SQLITE_DIRECT_OVERFLOW_READ
  3789. /* If all the following are true:
  3790. **
  3791. ** 1) this is a read operation, and
  3792. ** 2) data is required from the start of this overflow page, and
  3793. ** 3) the database is file-backed, and
  3794. ** 4) there is no open write-transaction, and
  3795. ** 5) the database is not a WAL database,
  3796. **
  3797. ** then data can be read directly from the database file into the
  3798. ** output buffer, bypassing the page-cache altogether. This speeds
  3799. ** up loading large records that span many overflow pages.
  3800. */
  3801. if( eOp==0 /* (1) */
  3802. && offset==0 /* (2) */
  3803. && pBt->inTransaction==TRANS_READ /* (4) */
  3804. && (fd = sqlite3PagerFile(pBt->pPager))->pMethods /* (3) */
  3805. && pBt->pPage1->aData[19]==0x01 /* (5) */
  3806. ){
  3807. u8 aSave[4];
  3808. u8 *aWrite = &pBuf[-4];
  3809. memcpy(aSave, aWrite, 4);
  3810. rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
  3811. nextPage = get4byte(aWrite);
  3812. memcpy(aWrite, aSave, 4);
  3813. }else
  3814. #endif
  3815. {
  3816. DbPage *pDbPage;
  3817. rc = sqlite3PagerAcquire(pBt->pPager, nextPage, &pDbPage,
  3818. (eOp==0 ? PAGER_GET_READONLY : 0)
  3819. );
  3820. if( rc==SQLITE_OK ){
  3821. aPayload = sqlite3PagerGetData(pDbPage);
  3822. nextPage = get4byte(aPayload);
  3823. rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
  3824. sqlite3PagerUnref(pDbPage);
  3825. offset = 0;
  3826. }
  3827. }
  3828. amt -= a;
  3829. pBuf += a;
  3830. }
  3831. }
  3832. }
  3833. if( rc==SQLITE_OK && amt>0 ){
  3834. return SQLITE_CORRUPT_BKPT;
  3835. }
  3836. return rc;
  3837. }
  3838. /*
  3839. ** Read part of the key associated with cursor pCur. Exactly
  3840. ** "amt" bytes will be transfered into pBuf[]. The transfer
  3841. ** begins at "offset".
  3842. **
  3843. ** The caller must ensure that pCur is pointing to a valid row
  3844. ** in the table.
  3845. **
  3846. ** Return SQLITE_OK on success or an error code if anything goes
  3847. ** wrong. An error is returned if "offset+amt" is larger than
  3848. ** the available payload.
  3849. */
  3850. int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3851. assert( cursorHoldsMutex(pCur) );
  3852. assert( pCur->eState==CURSOR_VALID );
  3853. assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3854. assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3855. return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
  3856. }
  3857. /*
  3858. ** Read part of the data associated with cursor pCur. Exactly
  3859. ** "amt" bytes will be transfered into pBuf[]. The transfer
  3860. ** begins at "offset".
  3861. **
  3862. ** Return SQLITE_OK on success or an error code if anything goes
  3863. ** wrong. An error is returned if "offset+amt" is larger than
  3864. ** the available payload.
  3865. */
  3866. int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3867. int rc;
  3868. #ifndef SQLITE_OMIT_INCRBLOB
  3869. if ( pCur->eState==CURSOR_INVALID ){
  3870. return SQLITE_ABORT;
  3871. }
  3872. #endif
  3873. assert( cursorHoldsMutex(pCur) );
  3874. rc = restoreCursorPosition(pCur);
  3875. if( rc==SQLITE_OK ){
  3876. assert( pCur->eState==CURSOR_VALID );
  3877. assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3878. assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3879. rc = accessPayload(pCur, offset, amt, pBuf, 0);
  3880. }
  3881. return rc;
  3882. }
  3883. /*
  3884. ** Return a pointer to payload information from the entry that the
  3885. ** pCur cursor is pointing to. The pointer is to the beginning of
  3886. ** the key if skipKey==0 and it points to the beginning of data if
  3887. ** skipKey==1. The number of bytes of available key/data is written
  3888. ** into *pAmt. If *pAmt==0, then the value returned will not be
  3889. ** a valid pointer.
  3890. **
  3891. ** This routine is an optimization. It is common for the entire key
  3892. ** and data to fit on the local page and for there to be no overflow
  3893. ** pages. When that is so, this routine can be used to access the
  3894. ** key and data without making a copy. If the key and/or data spills
  3895. ** onto overflow pages, then accessPayload() must be used to reassemble
  3896. ** the key/data and copy it into a preallocated buffer.
  3897. **
  3898. ** The pointer returned by this routine looks directly into the cached
  3899. ** page of the database. The data might change or move the next time
  3900. ** any btree routine is called.
  3901. */
  3902. static const unsigned char *fetchPayload(
  3903. BtCursor *pCur, /* Cursor pointing to entry to read from */
  3904. int *pAmt, /* Write the number of available bytes here */
  3905. int skipKey /* read beginning at data if this is true */
  3906. ){
  3907. unsigned char *aPayload;
  3908. MemPage *pPage;
  3909. u32 nKey;
  3910. u32 nLocal;
  3911. assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
  3912. assert( pCur->eState==CURSOR_VALID );
  3913. assert( cursorHoldsMutex(pCur) );
  3914. pPage = pCur->apPage[pCur->iPage];
  3915. assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3916. if( NEVER(pCur->info.nSize==0) ){
  3917. btreeParseCell(pCur->apPage[pCur->iPage], pCur->aiIdx[pCur->iPage],
  3918. &pCur->info);
  3919. }
  3920. aPayload = pCur->info.pCell;
  3921. aPayload += pCur->info.nHeader;
  3922. if( pPage->intKey ){
  3923. nKey = 0;
  3924. }else{
  3925. nKey = (int)pCur->info.nKey;
  3926. }
  3927. if( skipKey ){
  3928. aPayload += nKey;
  3929. nLocal = pCur->info.nLocal - nKey;
  3930. }else{
  3931. nLocal = pCur->info.nLocal;
  3932. assert( nLocal<=nKey );
  3933. }
  3934. *pAmt = nLocal;
  3935. return aPayload;
  3936. }
  3937. /*
  3938. ** For the entry that cursor pCur is point to, return as
  3939. ** many bytes of the key or data as are available on the local
  3940. ** b-tree page. Write the number of available bytes into *pAmt.
  3941. **
  3942. ** The pointer returned is ephemeral. The key/data may move
  3943. ** or be destroyed on the next call to any Btree routine,
  3944. ** including calls from other threads against the same cache.
  3945. ** Hence, a mutex on the BtShared should be held prior to calling
  3946. ** this routine.
  3947. **
  3948. ** These routines is used to get quick access to key and data
  3949. ** in the common case where no overflow pages are used.
  3950. */
  3951. const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
  3952. const void *p = 0;
  3953. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3954. assert( cursorHoldsMutex(pCur) );
  3955. if( ALWAYS(pCur->eState==CURSOR_VALID) ){
  3956. p = (const void*)fetchPayload(pCur, pAmt, 0);
  3957. }
  3958. return p;
  3959. }
  3960. const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
  3961. const void *p = 0;
  3962. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3963. assert( cursorHoldsMutex(pCur) );
  3964. if( ALWAYS(pCur->eState==CURSOR_VALID) ){
  3965. p = (const void*)fetchPayload(pCur, pAmt, 1);
  3966. }
  3967. return p;
  3968. }
  3969. /*
  3970. ** Move the cursor down to a new child page. The newPgno argument is the
  3971. ** page number of the child page to move to.
  3972. **
  3973. ** This function returns SQLITE_CORRUPT if the page-header flags field of
  3974. ** the new child page does not match the flags field of the parent (i.e.
  3975. ** if an intkey page appears to be the parent of a non-intkey page, or
  3976. ** vice-versa).
  3977. */
  3978. static int moveToChild(BtCursor *pCur, u32 newPgno){
  3979. int rc;
  3980. int i = pCur->iPage;
  3981. MemPage *pNewPage;
  3982. BtShared *pBt = pCur->pBt;
  3983. assert( cursorHoldsMutex(pCur) );
  3984. assert( pCur->eState==CURSOR_VALID );
  3985. assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
  3986. assert( pCur->iPage>=0 );
  3987. if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
  3988. return SQLITE_CORRUPT_BKPT;
  3989. }
  3990. rc = getAndInitPage(pBt, newPgno, &pNewPage,
  3991. pCur->wrFlag==0 ? PAGER_GET_READONLY : 0);
  3992. if( rc ) return rc;
  3993. pCur->apPage[i+1] = pNewPage;
  3994. pCur->aiIdx[i+1] = 0;
  3995. pCur->iPage++;
  3996. pCur->info.nSize = 0;
  3997. pCur->validNKey = 0;
  3998. if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){
  3999. return SQLITE_CORRUPT_BKPT;
  4000. }
  4001. return SQLITE_OK;
  4002. }
  #if 0
  /*
  ** Debugging aid, currently compiled out.
  **
  ** pParent is an internal (non-leaf) tree page. Assert that page number
  ** iChild is the left-child of the iIdx'th cell of pParent or, when iIdx
  ** equals the number of cells on pParent, that iChild is the right-child
  ** of the page.
  */
  static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
    assert( iIdx<=pParent->nCell );
    if( iIdx!=pParent->nCell ){
      /* Left-child pointer stored at the start of cell iIdx */
      assert( get4byte(findCell(pParent, iIdx))==iChild );
    }else{
      /* Right-child pointer stored at offset 8 of the page header */
      assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
    }
  }
  #else
  #  define assertParentIndex(x,y,z)
  #endif
  4022. /*
  4023. ** Move the cursor up to the parent page.
  4024. **
  4025. ** pCur->idx is set to the cell index that contains the pointer
  4026. ** to the page we are coming from. If we are coming from the
  4027. ** right-most child page then pCur->idx is set to one more than
  4028. ** the largest cell index.
  4029. */
  4030. static void moveToParent(BtCursor *pCur){
  4031. assert( cursorHoldsMutex(pCur) );
  4032. assert( pCur->eState==CURSOR_VALID );
  4033. assert( pCur->iPage>0 );
  4034. assert( pCur->apPage[pCur->iPage] );
  4035. /* UPDATE: It is actually possible for the condition tested by the assert
  4036. ** below to be untrue if the database file is corrupt. This can occur if
  4037. ** one cursor has modified page pParent while a reference to it is held
  4038. ** by a second cursor. Which can only happen if a single page is linked
  4039. ** into more than one b-tree structure in a corrupt database. */
  4040. #if 0
  4041. assertParentIndex(
  4042. pCur->apPage[pCur->iPage-1],
  4043. pCur->aiIdx[pCur->iPage-1],
  4044. pCur->apPage[pCur->iPage]->pgno
  4045. );
  4046. #endif
  4047. testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
  4048. releasePage(pCur->apPage[pCur->iPage]);
  4049. pCur->iPage--;
  4050. pCur->info.nSize = 0;
  4051. pCur->validNKey = 0;
  4052. }
  4053. /*
  4054. ** Move the cursor to point to the root page of its b-tree structure.
  4055. **
  4056. ** If the table has a virtual root page, then the cursor is moved to point
  4057. ** to the virtual root page instead of the actual root page. A table has a
  4058. ** virtual root page when the actual root page contains no cells and a
  4059. ** single child page. This can only happen with the table rooted at page 1.
  4060. **
  4061. ** If the b-tree structure is empty, the cursor state is set to
  4062. ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first
  4063. ** cell located on the root (or virtual root) page and the cursor state
  4064. ** is set to CURSOR_VALID.
  4065. **
  4066. ** If this function returns successfully, it may be assumed that the
  4067. ** page-header flags indicate that the [virtual] root-page is the expected
  4068. ** kind of b-tree page (i.e. if when opening the cursor the caller did not
  4069. ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
  4070. ** indicating a table b-tree, or if the caller did specify a KeyInfo
  4071. ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
  4072. ** b-tree).
  4073. */
  4074. static int moveToRoot(BtCursor *pCur){
  4075. MemPage *pRoot;
  4076. int rc = SQLITE_OK;
  4077. Btree *p = pCur->pBtree;
  4078. BtShared *pBt = p->pBt;
  4079. assert( cursorHoldsMutex(pCur) );
  4080. assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  4081. assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
  4082. assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
  4083. if( pCur->eState>=CURSOR_REQUIRESEEK ){
  4084. if( pCur->eState==CURSOR_FAULT ){
  4085. assert( pCur->skipNext!=SQLITE_OK );
  4086. return pCur->skipNext;
  4087. }
  4088. sqlite3BtreeClearCursor(pCur);
  4089. }
  4090. if( pCur->iPage>=0 ){
  4091. int i;
  4092. for(i=1; i<=pCur->iPage; i++){
  4093. releasePage(pCur->apPage[i]);
  4094. }
  4095. pCur->iPage = 0;
  4096. }else if( pCur->pgnoRoot==0 ){
  4097. pCur->eState = CURSOR_INVALID;
  4098. return SQLITE_OK;
  4099. }else{
  4100. rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0],
  4101. pCur->wrFlag==0 ? PAGER_GET_READONLY : 0);
  4102. if( rc!=SQLITE_OK ){
  4103. pCur->eState = CURSOR_INVALID;
  4104. return rc;
  4105. }
  4106. pCur->iPage = 0;
  4107. /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
  4108. ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
  4109. ** NULL, the caller expects a table b-tree. If this is not the case,
  4110. ** return an SQLITE_CORRUPT error. */
  4111. assert( pCur->apPage[0]->intKey==1 || pCur->apPage[0]->intKey==0 );
  4112. if( (pCur->pKeyInfo==0)!=pCur->apPage[0]->intKey ){
  4113. return SQLITE_CORRUPT_BKPT;
  4114. }
  4115. }
  4116. /* Assert that the root page is of the correct type. This must be the
  4117. ** case as the call to this function that loaded the root-page (either
  4118. ** this call or a previous invocation) would have detected corruption
  4119. ** if the assumption were not true, and it is not possible for the flags
  4120. ** byte to have been modified while this cursor is holding a reference
  4121. ** to the page. */
  4122. pRoot = pCur->apPage[0];
  4123. assert( pRoot->pgno==pCur->pgnoRoot );
  4124. assert( pRoot->isInit && (pCur->pKeyInfo==0)==pRoot->intKey );
  4125. pCur->aiIdx[0] = 0;
  4126. pCur->info.nSize = 0;
  4127. pCur->atLast = 0;
  4128. pCur->validNKey = 0;
  4129. if( pRoot->nCell==0 && !pRoot->leaf ){
  4130. Pgno subpage;
  4131. if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
  4132. subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
  4133. pCur->eState = CURSOR_VALID;
  4134. rc = moveToChild(pCur, subpage);
  4135. }else{
  4136. pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
  4137. }
  4138. return rc;
  4139. }
  4140. /*
  4141. ** Move the cursor down to the left-most leaf entry beneath the
  4142. ** entry to which it is currently pointing.
  4143. **
  4144. ** The left-most leaf is the one with the smallest key - the first
  4145. ** in ascending order.
  4146. */
  4147. static int moveToLeftmost(BtCursor *pCur){
  4148. Pgno pgno;
  4149. int rc = SQLITE_OK;
  4150. MemPage *pPage;
  4151. assert( cursorHoldsMutex(pCur) );
  4152. assert( pCur->eState==CURSOR_VALID );
  4153. while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  4154. assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  4155. pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
  4156. rc = moveToChild(pCur, pgno);
  4157. }
  4158. return rc;
  4159. }
  4160. /*
  4161. ** Move the cursor down to the right-most leaf entry beneath the
  4162. ** page to which it is currently pointing. Notice the difference
  4163. ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
  4164. ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
  4165. ** finds the right-most entry beneath the *page*.
  4166. **
  4167. ** The right-most entry is the one with the largest key - the last
  4168. ** key in ascending order.
  4169. */
  4170. static int moveToRightmost(BtCursor *pCur){
  4171. Pgno pgno;
  4172. int rc = SQLITE_OK;
  4173. MemPage *pPage = 0;
  4174. assert( cursorHoldsMutex(pCur) );
  4175. assert( pCur->eState==CURSOR_VALID );
  4176. while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  4177. pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  4178. pCur->aiIdx[pCur->iPage] = pPage->nCell;
  4179. rc = moveToChild(pCur, pgno);
  4180. }
  4181. if( rc==SQLITE_OK ){
  4182. pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
  4183. pCur->info.nSize = 0;
  4184. pCur->validNKey = 0;
  4185. }
  4186. return rc;
  4187. }
  4188. /* Move the cursor to the first entry in the table. Return SQLITE_OK
  4189. ** on success. Set *pRes to 0 if the cursor actually points to something
  4190. ** or set *pRes to 1 if the table is empty.
  4191. */
  4192. int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
  4193. int rc;
  4194. assert( cursorHoldsMutex(pCur) );
  4195. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  4196. rc = moveToRoot(pCur);
  4197. if( rc==SQLITE_OK ){
  4198. if( pCur->eState==CURSOR_INVALID ){
  4199. assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
  4200. *pRes = 1;
  4201. }else{
  4202. assert( pCur->apPage[pCur->iPage]->nCell>0 );
  4203. *pRes = 0;
  4204. rc = moveToLeftmost(pCur);
  4205. }
  4206. }
  4207. return rc;
  4208. }
  4209. /* Move the cursor to the last entry in the table. Return SQLITE_OK
  4210. ** on success. Set *pRes to 0 if the cursor actually points to something
  4211. ** or set *pRes to 1 if the table is empty.
  4212. */
  4213. int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
  4214. int rc;
  4215. assert( cursorHoldsMutex(pCur) );
  4216. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  4217. /* If the cursor already points to the last entry, this is a no-op. */
  4218. if( CURSOR_VALID==pCur->eState && pCur->atLast ){
  4219. #ifdef SQLITE_DEBUG
  4220. /* This block serves to assert() that the cursor really does point
  4221. ** to the last entry in the b-tree. */
  4222. int ii;
  4223. for(ii=0; ii<pCur->iPage; ii++){
  4224. assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
  4225. }
  4226. assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 );
  4227. assert( pCur->apPage[pCur->iPage]->leaf );
  4228. #endif
  4229. return SQLITE_OK;
  4230. }
  4231. rc = moveToRoot(pCur);
  4232. if( rc==SQLITE_OK ){
  4233. if( CURSOR_INVALID==pCur->eState ){
  4234. assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
  4235. *pRes = 1;
  4236. }else{
  4237. assert( pCur->eState==CURSOR_VALID );
  4238. *pRes = 0;
  4239. rc = moveToRightmost(pCur);
  4240. pCur->atLast = rc==SQLITE_OK ?1:0;
  4241. }
  4242. }
  4243. return rc;
  4244. }
  4245. /* Move the cursor so that it points to an entry near the key
  4246. ** specified by pIdxKey or intKey. Return a success code.
  4247. **
  4248. ** For INTKEY tables, the intKey parameter is used. pIdxKey
  4249. ** must be NULL. For index tables, pIdxKey is used and intKey
  4250. ** is ignored.
  4251. **
  4252. ** If an exact match is not found, then the cursor is always
  4253. ** left pointing at a leaf page which would hold the entry if it
  4254. ** were present. The cursor might point to an entry that comes
  4255. ** before or after the key.
  4256. **
  4257. ** An integer is written into *pRes which is the result of
  4258. ** comparing the key with the entry to which the cursor is
  4259. ** pointing. The meaning of the integer written into
  4260. ** *pRes is as follows:
  4261. **
  4262. ** *pRes<0 The cursor is left pointing at an entry that
  4263. ** is smaller than intKey/pIdxKey or if the table is empty
  4264. ** and the cursor is therefore left point to nothing.
  4265. **
  4266. ** *pRes==0 The cursor is left pointing at an entry that
  4267. ** exactly matches intKey/pIdxKey.
  4268. **
  4269. ** *pRes>0 The cursor is left pointing at an entry that
  4270. ** is larger than intKey/pIdxKey.
  4271. **
  4272. */
  4273. int sqlite3BtreeMovetoUnpacked(
  4274. BtCursor *pCur, /* The cursor to be moved */
  4275. UnpackedRecord *pIdxKey, /* Unpacked index key */
  4276. i64 intKey, /* The table key */
  4277. int biasRight, /* If true, bias the search to the high end */
  4278. int *pRes /* Write search results here */
  4279. ){
  4280. int rc;
  4281. assert( cursorHoldsMutex(pCur) );
  4282. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  4283. assert( pRes );
  4284. assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
  4285. /* If the cursor is already positioned at the point we are trying
  4286. ** to move to, then just return without doing any work */
  4287. if( pCur->eState==CURSOR_VALID && pCur->validNKey
  4288. && pCur->apPage[0]->intKey
  4289. ){
  4290. if( pCur->info.nKey==intKey ){
  4291. *pRes = 0;
  4292. return SQLITE_OK;
  4293. }
  4294. if( pCur->atLast && pCur->info.nKey<intKey ){
  4295. *pRes = -1;
  4296. return SQLITE_OK;
  4297. }
  4298. }
  4299. rc = moveToRoot(pCur);
  4300. if( rc ){
  4301. return rc;
  4302. }
  4303. assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] );
  4304. assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit );
  4305. assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 );
  4306. if( pCur->eState==CURSOR_INVALID ){
  4307. *pRes = -1;
  4308. assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 );
  4309. return SQLITE_OK;
  4310. }
  4311. assert( pCur->apPage[0]->intKey || pIdxKey );
  4312. for(;;){
  4313. int lwr, upr, idx;
  4314. Pgno chldPg;
  4315. MemPage *pPage = pCur->apPage[pCur->iPage];
  4316. int c;
  4317. /* pPage->nCell must be greater than zero. If this is the root-page
  4318. ** the cursor would have been INVALID above and this for(;;) loop
  4319. ** not run. If this is not the root-page, then the moveToChild() routine
  4320. ** would have already detected db corruption. Similarly, pPage must
  4321. ** be the right kind (index or table) of b-tree page. Otherwise
  4322. ** a moveToChild() or moveToRoot() call would have detected corruption. */
  4323. assert( pPage->nCell>0 );
  4324. assert( pPage->intKey==(pIdxKey==0) );
  4325. lwr = 0;
  4326. upr = pPage->nCell-1;
  4327. if( biasRight ){
  4328. pCur->aiIdx[pCur->iPage] = (u16)(idx = upr);
  4329. }else{
  4330. pCur->aiIdx[pCur->iPage] = (u16)(idx = (upr+lwr)/2);
  4331. }
  4332. for(;;){
  4333. u8 *pCell; /* Pointer to current cell in pPage */
  4334. assert( idx==pCur->aiIdx[pCur->iPage] );
  4335. pCur->info.nSize = 0;
  4336. pCell = findCell(pPage, idx) + pPage->childPtrSize;
  4337. if( pPage->intKey ){
  4338. i64 nCellKey;
  4339. if( pPage->hasData ){
  4340. u32 dummy;
  4341. pCell += getVarint32(pCell, dummy);
  4342. }
  4343. getVarint(pCell, (u64*)&nCellKey);
  4344. if( nCellKey==intKey ){
  4345. c = 0;
  4346. }else if( nCellKey<intKey ){
  4347. c = -1;
  4348. }else{
  4349. assert( nCellKey>intKey );
  4350. c = +1;
  4351. }
  4352. pCur->validNKey = 1;
  4353. pCur->info.nKey = nCellKey;
  4354. }else{
  4355. /* The maximum supported page-size is 65536 bytes. This means that
  4356. ** the maximum number of record bytes stored on an index B-Tree
  4357. ** page is less than 16384 bytes and may be stored as a 2-byte
  4358. ** varint. This information is used to attempt to avoid parsing
  4359. ** the entire cell by checking for the cases where the record is
  4360. ** stored entirely within the b-tree page by inspecting the first
  4361. ** 2 bytes of the cell.
  4362. */
  4363. int nCell = pCell[0];
  4364. if( nCell<=pPage->max1bytePayload
  4365. /* && (pCell+nCell)<pPage->aDataEnd */
  4366. ){
  4367. /* This branch runs if the record-size field of the cell is a
  4368. ** single byte varint and the record fits entirely on the main
  4369. ** b-tree page. */
  4370. testcase( pCell+nCell+1==pPage->aDataEnd );
  4371. c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
  4372. }else if( !(pCell[1] & 0x80)
  4373. && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
  4374. /* && (pCell+nCell+2)<=pPage->aDataEnd */
  4375. ){
  4376. /* The record-size field is a 2 byte varint and the record
  4377. ** fits entirely on the main b-tree page. */
  4378. testcase( pCell+nCell+2==pPage->aDataEnd );
  4379. c = sqlite3VdbeRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
  4380. }else{
  4381. /* The record flows over onto one or more overflow pages. In
  4382. ** this case the whole cell needs to be parsed, a buffer allocated
  4383. ** and accessPayload() used to retrieve the record into the
  4384. ** buffer before VdbeRecordCompare() can be called. */
  4385. void *pCellKey;
  4386. u8 * const pCellBody = pCell - pPage->childPtrSize;
  4387. btreeParseCellPtr(pPage, pCellBody, &pCur->info);
  4388. nCell = (int)pCur->info.nKey;
  4389. pCellKey = sqlite3Malloc( nCell );
  4390. if( pCellKey==0 ){
  4391. rc = SQLITE_NOMEM;
  4392. goto moveto_finish;
  4393. }
  4394. rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
  4395. if( rc ){
  4396. sqlite3_free(pCellKey);
  4397. goto moveto_finish;
  4398. }
  4399. c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
  4400. sqlite3_free(pCellKey);
  4401. }
  4402. }
  4403. if( c==0 ){
  4404. if( pPage->intKey && !pPage->leaf ){
  4405. lwr = idx;
  4406. break;
  4407. }else{
  4408. *pRes = 0;
  4409. rc = SQLITE_OK;
  4410. goto moveto_finish;
  4411. }
  4412. }
  4413. if( c<0 ){
  4414. lwr = idx+1;
  4415. }else{
  4416. upr = idx-1;
  4417. }
  4418. if( lwr>upr ){
  4419. break;
  4420. }
  4421. pCur->aiIdx[pCur->iPage] = (u16)(idx = (lwr+upr)/2);
  4422. }
  4423. assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
  4424. assert( pPage->isInit );
  4425. if( pPage->leaf ){
  4426. chldPg = 0;
  4427. }else if( lwr>=pPage->nCell ){
  4428. chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  4429. }else{
  4430. chldPg = get4byte(findCell(pPage, lwr));
  4431. }
  4432. if( chldPg==0 ){
  4433. assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  4434. *pRes = c;
  4435. rc = SQLITE_OK;
  4436. goto moveto_finish;
  4437. }
  4438. pCur->aiIdx[pCur->iPage] = (u16)lwr;
  4439. pCur->info.nSize = 0;
  4440. pCur->validNKey = 0;
  4441. rc = moveToChild(pCur, chldPg);
  4442. if( rc ) goto moveto_finish;
  4443. }
  4444. moveto_finish:
  4445. return rc;
  4446. }
  4447. /*
  4448. ** Return TRUE if the cursor is not pointing at an entry of the table.
  4449. **
  4450. ** TRUE will be returned after a call to sqlite3BtreeNext() moves
  4451. ** past the last entry in the table or sqlite3BtreePrev() moves past
  4452. ** the first entry. TRUE is also returned if the table is empty.
  4453. */
  4454. int sqlite3BtreeEof(BtCursor *pCur){
  4455. /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
  4456. ** have been deleted? This API will need to change to return an error code
  4457. ** as well as the boolean result value.
  4458. */
  4459. return (CURSOR_VALID!=pCur->eState);
  4460. }
  4461. /*
  4462. ** Advance the cursor to the next entry in the database. If
  4463. ** successful then set *pRes=0. If the cursor
  4464. ** was already pointing to the last entry in the database before
  4465. ** this routine was called, then set *pRes=1.
  4466. */
  4467. int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
  4468. int rc;
  4469. int idx;
  4470. MemPage *pPage;
  4471. assert( cursorHoldsMutex(pCur) );
  4472. assert( pRes!=0 );
  4473. assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  4474. if( pCur->eState!=CURSOR_VALID ){
  4475. rc = restoreCursorPosition(pCur);
  4476. if( rc!=SQLITE_OK ){
  4477. *pRes = 0;
  4478. return rc;
  4479. }
  4480. if( CURSOR_INVALID==pCur->eState ){
  4481. *pRes = 1;
  4482. return SQLITE_OK;
  4483. }
  4484. if( pCur->skipNext ){
  4485. assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
  4486. pCur->eState = CURSOR_VALID;
  4487. if( pCur->skipNext>0 ){
  4488. pCur->skipNext = 0;
  4489. *pRes = 0;
  4490. return SQLITE_OK;
  4491. }
  4492. pCur->skipNext = 0;
  4493. }
  4494. }
  4495. pPage = pCur->apPage[pCur->iPage];
  4496. idx = ++pCur->aiIdx[pCur->iPage];
  4497. assert( pPage->isInit );
  4498. /* If the database file is corrupt, it is possible for the value of idx
  4499. ** to be invalid here. This can only occur if a second cursor modifies
  4500. ** the page while cursor pCur is holding a reference to it. Which can
  4501. ** only happen if the database is corrupt in such a way as to link the
  4502. ** page into more than one b-tree structure. */
  4503. testcase( idx>pPage->nCell );
  4504. pCur->info.nSize = 0;
  4505. pCur->validNKey = 0;
  4506. if( idx>=pPage->nCell ){
  4507. if( !pPage->leaf ){
  4508. rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  4509. if( rc ){
  4510. *pRes = 0;
  4511. return rc;
  4512. }
  4513. rc = moveToLeftmost(pCur);
  4514. *pRes = 0;
  4515. return rc;
  4516. }
  4517. do{
  4518. if( pCur->iPage==0 ){
  4519. *pRes = 1;
  4520. pCur->eState = CURSOR_INVALID;
  4521. return SQLITE_OK;
  4522. }
  4523. moveToParent(pCur);
  4524. pPage = pCur->apPage[pCur->iPage];
  4525. }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
  4526. *pRes = 0;
  4527. if( pPage->intKey ){
  4528. rc = sqlite3BtreeNext(pCur, pRes);
  4529. }else{
  4530. rc = SQLITE_OK;
  4531. }
  4532. return rc;
  4533. }
  4534. *pRes = 0;
  4535. if( pPage->leaf ){
  4536. return SQLITE_OK;
  4537. }
  4538. rc = moveToLeftmost(pCur);
  4539. return rc;
  4540. }
  4541. /*
  4542. ** Step the cursor to the back to the previous entry in the database. If
  4543. ** successful then set *pRes=0. If the cursor
  4544. ** was already pointing to the first entry in the database before
  4545. ** this routine was called, then set *pRes=1.
  4546. */
  4547. int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
  4548. int rc;
  4549. MemPage *pPage;
  4550. assert( cursorHoldsMutex(pCur) );
  4551. assert( pRes!=0 );
  4552. assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  4553. pCur->atLast = 0;
  4554. if( pCur->eState!=CURSOR_VALID ){
  4555. if( ALWAYS(pCur->eState>=CURSOR_REQUIRESEEK) ){
  4556. rc = btreeRestoreCursorPosition(pCur);
  4557. if( rc!=SQLITE_OK ){
  4558. *pRes = 0;
  4559. return rc;
  4560. }
  4561. }
  4562. if( CURSOR_INVALID==pCur->eState ){
  4563. *pRes = 1;
  4564. return SQLITE_OK;
  4565. }
  4566. if( pCur->skipNext ){
  4567. assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
  4568. pCur->eState = CURSOR_VALID;
  4569. if( pCur->skipNext<0 ){
  4570. pCur->skipNext = 0;
  4571. *pRes = 0;
  4572. return SQLITE_OK;
  4573. }
  4574. pCur->skipNext = 0;
  4575. }
  4576. }
  4577. pPage = pCur->apPage[pCur->iPage];
  4578. assert( pPage->isInit );
  4579. if( !pPage->leaf ){
  4580. int idx = pCur->aiIdx[pCur->iPage];
  4581. rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
  4582. if( rc ){
  4583. *pRes = 0;
  4584. return rc;
  4585. }
  4586. rc = moveToRightmost(pCur);
  4587. }else{
  4588. while( pCur->aiIdx[pCur->iPage]==0 ){
  4589. if( pCur->iPage==0 ){
  4590. pCur->eState = CURSOR_INVALID;
  4591. *pRes = 1;
  4592. return SQLITE_OK;
  4593. }
  4594. moveToParent(pCur);
  4595. }
  4596. pCur->info.nSize = 0;
  4597. pCur->validNKey = 0;
  4598. pCur->aiIdx[pCur->iPage]--;
  4599. pPage = pCur->apPage[pCur->iPage];
  4600. if( pPage->intKey && !pPage->leaf ){
  4601. rc = sqlite3BtreePrevious(pCur, pRes);
  4602. }else{
  4603. rc = SQLITE_OK;
  4604. }
  4605. }
  4606. *pRes = 0;
  4607. return rc;
  4608. }
  4609. /*
  4610. ** Allocate a new page from the database file.
  4611. **
  4612. ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
  4613. ** has already been called on the new page.) The new page has also
  4614. ** been referenced and the calling routine is responsible for calling
  4615. ** sqlite3PagerUnref() on the new page when it is done.
  4616. **
  4617. ** SQLITE_OK is returned on success. Any other return value indicates
  4618. ** an error. *ppPage and *pPgno are undefined in the event of an error.
  4619. ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
  4620. **
  4621. ** If the "nearby" parameter is not 0, then an effort is made to
  4622. ** locate a page close to the page number "nearby". This can be used in an
  4623. ** attempt to keep related pages close to each other in the database file,
  4624. ** which in turn can make database access faster.
  4625. **
  4626. ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
  4627. ** anywhere on the free-list, then it is guaranteed to be returned. If
  4628. ** eMode is BTALLOC_LE then the page returned will be less than or equal
  4629. ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there
  4630. ** are no restrictions on which page is returned.
  4631. */
  4632. static int allocateBtreePage(
  4633. BtShared *pBt, /* The btree */
  4634. MemPage **ppPage, /* Store pointer to the allocated page here */
  4635. Pgno *pPgno, /* Store the page number here */
  4636. Pgno nearby, /* Search for a page near this one */
  4637. u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
  4638. ){
  4639. MemPage *pPage1;
  4640. int rc;
  4641. u32 n; /* Number of pages on the freelist */
  4642. u32 k; /* Number of leaves on the trunk of the freelist */
  4643. MemPage *pTrunk = 0;
  4644. MemPage *pPrevTrunk = 0;
  4645. Pgno mxPage; /* Total size of the database file */
  4646. assert( sqlite3_mutex_held(pBt->mutex) );
  4647. assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
  4648. pPage1 = pBt->pPage1;
  4649. mxPage = btreePagecount(pBt);
  4650. n = get4byte(&pPage1->aData[36]);
  4651. testcase( n==mxPage-1 );
  4652. if( n>=mxPage ){
  4653. return SQLITE_CORRUPT_BKPT;
  4654. }
  4655. if( n>0 ){
  4656. /* There are pages on the freelist. Reuse one of those pages. */
  4657. Pgno iTrunk;
  4658. u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
  4659. /* If eMode==BTALLOC_EXACT and a query of the pointer-map
  4660. ** shows that the page 'nearby' is somewhere on the free-list, then
  4661. ** the entire-list will be searched for that page.
  4662. */
  4663. #ifndef SQLITE_OMIT_AUTOVACUUM
  4664. if( eMode==BTALLOC_EXACT ){
  4665. if( nearby<=mxPage ){
  4666. u8 eType;
  4667. assert( nearby>0 );
  4668. assert( pBt->autoVacuum );
  4669. rc = ptrmapGet(pBt, nearby, &eType, 0);
  4670. if( rc ) return rc;
  4671. if( eType==PTRMAP_FREEPAGE ){
  4672. searchList = 1;
  4673. }
  4674. }
  4675. }else if( eMode==BTALLOC_LE ){
  4676. searchList = 1;
  4677. }
  4678. #endif
  4679. /* Decrement the free-list count by 1. Set iTrunk to the index of the
  4680. ** first free-list trunk page. iPrevTrunk is initially 1.
  4681. */
  4682. rc = sqlite3PagerWrite(pPage1->pDbPage);
  4683. if( rc ) return rc;
  4684. put4byte(&pPage1->aData[36], n-1);
  4685. /* The code within this loop is run only once if the 'searchList' variable
  4686. ** is not true. Otherwise, it runs once for each trunk-page on the
  4687. ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
  4688. ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
  4689. */
  4690. do {
  4691. pPrevTrunk = pTrunk;
  4692. if( pPrevTrunk ){
  4693. iTrunk = get4byte(&pPrevTrunk->aData[0]);
  4694. }else{
  4695. iTrunk = get4byte(&pPage1->aData[32]);
  4696. }
  4697. testcase( iTrunk==mxPage );
  4698. if( iTrunk>mxPage ){
  4699. rc = SQLITE_CORRUPT_BKPT;
  4700. }else{
  4701. rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
  4702. }
  4703. if( rc ){
  4704. pTrunk = 0;
  4705. goto end_allocate_page;
  4706. }
  4707. assert( pTrunk!=0 );
  4708. assert( pTrunk->aData!=0 );
  4709. k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */
  4710. if( k==0 && !searchList ){
  4711. /* The trunk has no leaves and the list is not being searched.
  4712. ** So extract the trunk page itself and use it as the newly
  4713. ** allocated page */
  4714. assert( pPrevTrunk==0 );
  4715. rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4716. if( rc ){
  4717. goto end_allocate_page;
  4718. }
  4719. *pPgno = iTrunk;
  4720. memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4721. *ppPage = pTrunk;
  4722. pTrunk = 0;
  4723. TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4724. }else if( k>(u32)(pBt->usableSize/4 - 2) ){
  4725. /* Value of k is out of range. Database corruption */
  4726. rc = SQLITE_CORRUPT_BKPT;
  4727. goto end_allocate_page;
  4728. #ifndef SQLITE_OMIT_AUTOVACUUM
  4729. }else if( searchList
  4730. && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
  4731. ){
  4732. /* The list is being searched and this trunk page is the page
  4733. ** to allocate, regardless of whether it has leaves.
  4734. */
  4735. *pPgno = iTrunk;
  4736. *ppPage = pTrunk;
  4737. searchList = 0;
  4738. rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4739. if( rc ){
  4740. goto end_allocate_page;
  4741. }
  4742. if( k==0 ){
  4743. if( !pPrevTrunk ){
  4744. memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4745. }else{
  4746. rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  4747. if( rc!=SQLITE_OK ){
  4748. goto end_allocate_page;
  4749. }
  4750. memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
  4751. }
  4752. }else{
  4753. /* The trunk page is required by the caller but it contains
  4754. ** pointers to free-list leaves. The first leaf becomes a trunk
  4755. ** page in this case.
  4756. */
  4757. MemPage *pNewTrunk;
  4758. Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
  4759. if( iNewTrunk>mxPage ){
  4760. rc = SQLITE_CORRUPT_BKPT;
  4761. goto end_allocate_page;
  4762. }
  4763. testcase( iNewTrunk==mxPage );
  4764. rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
  4765. if( rc!=SQLITE_OK ){
  4766. goto end_allocate_page;
  4767. }
  4768. rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
  4769. if( rc!=SQLITE_OK ){
  4770. releasePage(pNewTrunk);
  4771. goto end_allocate_page;
  4772. }
  4773. memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
  4774. put4byte(&pNewTrunk->aData[4], k-1);
  4775. memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
  4776. releasePage(pNewTrunk);
  4777. if( !pPrevTrunk ){
  4778. assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
  4779. put4byte(&pPage1->aData[32], iNewTrunk);
  4780. }else{
  4781. rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  4782. if( rc ){
  4783. goto end_allocate_page;
  4784. }
  4785. put4byte(&pPrevTrunk->aData[0], iNewTrunk);
  4786. }
  4787. }
  4788. pTrunk = 0;
  4789. TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4790. #endif
  4791. }else if( k>0 ){
  4792. /* Extract a leaf from the trunk */
  4793. u32 closest;
  4794. Pgno iPage;
  4795. unsigned char *aData = pTrunk->aData;
  4796. if( nearby>0 ){
  4797. u32 i;
  4798. closest = 0;
  4799. if( eMode==BTALLOC_LE ){
  4800. for(i=0; i<k; i++){
  4801. iPage = get4byte(&aData[8+i*4]);
  4802. if( iPage<=nearby ){
  4803. closest = i;
  4804. break;
  4805. }
  4806. }
  4807. }else{
  4808. int dist;
  4809. dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
  4810. for(i=1; i<k; i++){
  4811. int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
  4812. if( d2<dist ){
  4813. closest = i;
  4814. dist = d2;
  4815. }
  4816. }
  4817. }
  4818. }else{
  4819. closest = 0;
  4820. }
  4821. iPage = get4byte(&aData[8+closest*4]);
  4822. testcase( iPage==mxPage );
  4823. if( iPage>mxPage ){
  4824. rc = SQLITE_CORRUPT_BKPT;
  4825. goto end_allocate_page;
  4826. }
  4827. testcase( iPage==mxPage );
  4828. if( !searchList
  4829. || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
  4830. ){
  4831. int noContent;
  4832. *pPgno = iPage;
  4833. TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
  4834. ": %d more free pages\n",
  4835. *pPgno, closest+1, k, pTrunk->pgno, n-1));
  4836. rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4837. if( rc ) goto end_allocate_page;
  4838. if( closest<k-1 ){
  4839. memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
  4840. }
  4841. put4byte(&aData[4], k-1);
  4842. noContent = !btreeGetHasContent(pBt, *pPgno) ? PAGER_GET_NOCONTENT : 0;
  4843. rc = btreeGetPage(pBt, *pPgno, ppPage, noContent);
  4844. if( rc==SQLITE_OK ){
  4845. rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4846. if( rc!=SQLITE_OK ){
  4847. releasePage(*ppPage);
  4848. }
  4849. }
  4850. searchList = 0;
  4851. }
  4852. }
  4853. releasePage(pPrevTrunk);
  4854. pPrevTrunk = 0;
  4855. }while( searchList );
  4856. }else{
  4857. /* There are no pages on the freelist, so append a new page to the
  4858. ** database image.
  4859. **
  4860. ** Normally, new pages allocated by this block can be requested from the
  4861. ** pager layer with the 'no-content' flag set. This prevents the pager
  4862. ** from trying to read the pages content from disk. However, if the
  4863. ** current transaction has already run one or more incremental-vacuum
  4864. ** steps, then the page we are about to allocate may contain content
  4865. ** that is required in the event of a rollback. In this case, do
  4866. ** not set the no-content flag. This causes the pager to load and journal
  4867. ** the current page content before overwriting it.
  4868. **
  4869. ** Note that the pager will not actually attempt to load or journal
  4870. ** content for any page that really does lie past the end of the database
  4871. ** file on disk. So the effects of disabling the no-content optimization
  4872. ** here are confined to those pages that lie between the end of the
  4873. ** database image and the end of the database file.
  4874. */
  4875. int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate)) ? PAGER_GET_NOCONTENT : 0;
  4876. rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  4877. if( rc ) return rc;
  4878. pBt->nPage++;
  4879. if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
  4880. #ifndef SQLITE_OMIT_AUTOVACUUM
  4881. if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
  4882. /* If *pPgno refers to a pointer-map page, allocate two new pages
  4883. ** at the end of the file instead of one. The first allocated page
  4884. ** becomes a new pointer-map page, the second is used by the caller.
  4885. */
  4886. MemPage *pPg = 0;
  4887. TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
  4888. assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
  4889. rc = btreeGetPage(pBt, pBt->nPage, &pPg, bNoContent);
  4890. if( rc==SQLITE_OK ){
  4891. rc = sqlite3PagerWrite(pPg->pDbPage);
  4892. releasePage(pPg);
  4893. }
  4894. if( rc ) return rc;
  4895. pBt->nPage++;
  4896. if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
  4897. }
  4898. #endif
  4899. put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
  4900. *pPgno = pBt->nPage;
  4901. assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4902. rc = btreeGetPage(pBt, *pPgno, ppPage, bNoContent);
  4903. if( rc ) return rc;
  4904. rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4905. if( rc!=SQLITE_OK ){
  4906. releasePage(*ppPage);
  4907. }
  4908. TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
  4909. }
  4910. assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4911. end_allocate_page:
  4912. releasePage(pTrunk);
  4913. releasePage(pPrevTrunk);
  4914. if( rc==SQLITE_OK ){
  4915. if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
  4916. releasePage(*ppPage);
  4917. return SQLITE_CORRUPT_BKPT;
  4918. }
  4919. (*ppPage)->isInit = 0;
  4920. }else{
  4921. *ppPage = 0;
  4922. }
  4923. assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) );
  4924. return rc;
  4925. }
  4926. /*
  4927. ** This function is used to add page iPage to the database file free-list.
  4928. ** It is assumed that the page is not already a part of the free-list.
  4929. **
  4930. ** The value passed as the second argument to this function is optional.
  4931. ** If the caller happens to have a pointer to the MemPage object
  4932. ** corresponding to page iPage handy, it may pass it as the second value.
  4933. ** Otherwise, it may pass NULL.
  4934. **
  4935. ** If a pointer to a MemPage object is passed as the second argument,
  4936. ** its reference count is not altered by this function.
  4937. */
  4938. static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
  4939. MemPage *pTrunk = 0; /* Free-list trunk page */
  4940. Pgno iTrunk = 0; /* Page number of free-list trunk page */
  4941. MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */
  4942. MemPage *pPage; /* Page being freed. May be NULL. */
  4943. int rc; /* Return Code */
  4944. int nFree; /* Initial number of pages on free-list */
  4945. assert( sqlite3_mutex_held(pBt->mutex) );
  4946. assert( iPage>1 );
  4947. assert( !pMemPage || pMemPage->pgno==iPage );
  4948. if( pMemPage ){
  4949. pPage = pMemPage;
  4950. sqlite3PagerRef(pPage->pDbPage);
  4951. }else{
  4952. pPage = btreePageLookup(pBt, iPage);
  4953. }
  4954. /* Increment the free page count on pPage1 */
  4955. rc = sqlite3PagerWrite(pPage1->pDbPage);
  4956. if( rc ) goto freepage_out;
  4957. nFree = get4byte(&pPage1->aData[36]);
  4958. put4byte(&pPage1->aData[36], nFree+1);
  4959. if( pBt->btsFlags & BTS_SECURE_DELETE ){
  4960. /* If the secure_delete option is enabled, then
  4961. ** always fully overwrite deleted information with zeros.
  4962. */
  4963. if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
  4964. || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
  4965. ){
  4966. goto freepage_out;
  4967. }
  4968. memset(pPage->aData, 0, pPage->pBt->pageSize);
  4969. }
  4970. /* If the database supports auto-vacuum, write an entry in the pointer-map
  4971. ** to indicate that the page is free.
  4972. */
  4973. if( ISAUTOVACUUM ){
  4974. ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
  4975. if( rc ) goto freepage_out;
  4976. }
  4977. /* Now manipulate the actual database free-list structure. There are two
  4978. ** possibilities. If the free-list is currently empty, or if the first
  4979. ** trunk page in the free-list is full, then this page will become a
  4980. ** new free-list trunk page. Otherwise, it will become a leaf of the
  4981. ** first trunk page in the current free-list. This block tests if it
  4982. ** is possible to add the page as a new free-list leaf.
  4983. */
  4984. if( nFree!=0 ){
  4985. u32 nLeaf; /* Initial number of leaf cells on trunk page */
  4986. iTrunk = get4byte(&pPage1->aData[32]);
  4987. rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
  4988. if( rc!=SQLITE_OK ){
  4989. goto freepage_out;
  4990. }
  4991. nLeaf = get4byte(&pTrunk->aData[4]);
  4992. assert( pBt->usableSize>32 );
  4993. if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
  4994. rc = SQLITE_CORRUPT_BKPT;
  4995. goto freepage_out;
  4996. }
  4997. if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
  4998. /* In this case there is room on the trunk page to insert the page
  4999. ** being freed as a new leaf.
  5000. **
  5001. ** Note that the trunk page is not really full until it contains
  5002. ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
  5003. ** coded. But due to a coding error in versions of SQLite prior to
  5004. ** 3.6.0, databases with freelist trunk pages holding more than
  5005. ** usableSize/4 - 8 entries will be reported as corrupt. In order
  5006. ** to maintain backwards compatibility with older versions of SQLite,
  5007. ** we will continue to restrict the number of entries to usableSize/4 - 8
  5008. ** for now. At some point in the future (once everyone has upgraded
  5009. ** to 3.6.0 or later) we should consider fixing the conditional above
  5010. ** to read "usableSize/4-2" instead of "usableSize/4-8".
  5011. */
  5012. rc = sqlite3PagerWrite(pTrunk->pDbPage);
  5013. if( rc==SQLITE_OK ){
  5014. put4byte(&pTrunk->aData[4], nLeaf+1);
  5015. put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
  5016. if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
  5017. sqlite3PagerDontWrite(pPage->pDbPage);
  5018. }
  5019. rc = btreeSetHasContent(pBt, iPage);
  5020. }
  5021. TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
  5022. goto freepage_out;
  5023. }
  5024. }
  5025. /* If control flows to this point, then it was not possible to add the
  5026. ** the page being freed as a leaf page of the first trunk in the free-list.
  5027. ** Possibly because the free-list is empty, or possibly because the
  5028. ** first trunk in the free-list is full. Either way, the page being freed
  5029. ** will become the new first trunk page in the free-list.
  5030. */
  5031. if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
  5032. goto freepage_out;
  5033. }
  5034. rc = sqlite3PagerWrite(pPage->pDbPage);
  5035. if( rc!=SQLITE_OK ){
  5036. goto freepage_out;
  5037. }
  5038. put4byte(pPage->aData, iTrunk);
  5039. put4byte(&pPage->aData[4], 0);
  5040. put4byte(&pPage1->aData[32], iPage);
  5041. TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
  5042. freepage_out:
  5043. if( pPage ){
  5044. pPage->isInit = 0;
  5045. }
  5046. releasePage(pPage);
  5047. releasePage(pTrunk);
  5048. return rc;
  5049. }
  5050. static void freePage(MemPage *pPage, int *pRC){
  5051. if( (*pRC)==SQLITE_OK ){
  5052. *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
  5053. }
  5054. }
  5055. /*
  5056. ** Free any overflow pages associated with the given Cell.
  5057. */
  5058. static int clearCell(MemPage *pPage, unsigned char *pCell){
  5059. BtShared *pBt = pPage->pBt;
  5060. CellInfo info;
  5061. Pgno ovflPgno;
  5062. int rc;
  5063. int nOvfl;
  5064. u32 ovflPageSize;
  5065. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5066. btreeParseCellPtr(pPage, pCell, &info);
  5067. if( info.iOverflow==0 ){
  5068. return SQLITE_OK; /* No overflow pages. Return without doing anything */
  5069. }
  5070. if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
  5071. return SQLITE_CORRUPT_BKPT; /* Cell extends past end of page */
  5072. }
  5073. ovflPgno = get4byte(&pCell[info.iOverflow]);
  5074. assert( pBt->usableSize > 4 );
  5075. ovflPageSize = pBt->usableSize - 4;
  5076. nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
  5077. assert( ovflPgno==0 || nOvfl>0 );
  5078. while( nOvfl-- ){
  5079. Pgno iNext = 0;
  5080. MemPage *pOvfl = 0;
  5081. if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
  5082. /* 0 is not a legal page number and page 1 cannot be an
  5083. ** overflow page. Therefore if ovflPgno<2 or past the end of the
  5084. ** file the database must be corrupt. */
  5085. return SQLITE_CORRUPT_BKPT;
  5086. }
  5087. if( nOvfl ){
  5088. rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
  5089. if( rc ) return rc;
  5090. }
  5091. if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
  5092. && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
  5093. ){
  5094. /* There is no reason any cursor should have an outstanding reference
  5095. ** to an overflow page belonging to a cell that is being deleted/updated.
  5096. ** So if there exists more than one reference to this page, then it
  5097. ** must not really be an overflow page and the database must be corrupt.
  5098. ** It is helpful to detect this before calling freePage2(), as
  5099. ** freePage2() may zero the page contents if secure-delete mode is
  5100. ** enabled. If this 'overflow' page happens to be a page that the
  5101. ** caller is iterating through or using in some other way, this
  5102. ** can be problematic.
  5103. */
  5104. rc = SQLITE_CORRUPT_BKPT;
  5105. }else{
  5106. rc = freePage2(pBt, pOvfl, ovflPgno);
  5107. }
  5108. if( pOvfl ){
  5109. sqlite3PagerUnref(pOvfl->pDbPage);
  5110. }
  5111. if( rc ) return rc;
  5112. ovflPgno = iNext;
  5113. }
  5114. return SQLITE_OK;
  5115. }
  5116. /*
  5117. ** Create the byte sequence used to represent a cell on page pPage
  5118. ** and write that byte sequence into pCell[]. Overflow pages are
  5119. ** allocated and filled in as necessary. The calling procedure
  5120. ** is responsible for making sure sufficient space has been allocated
  5121. ** for pCell[].
  5122. **
  5123. ** Note that pCell does not necessary need to point to the pPage->aData
  5124. ** area. pCell might point to some temporary storage. The cell will
  5125. ** be constructed in this temporary area then copied into pPage->aData
  5126. ** later.
  5127. */
  5128. static int fillInCell(
  5129. MemPage *pPage, /* The page that contains the cell */
  5130. unsigned char *pCell, /* Complete text of the cell */
  5131. const void *pKey, i64 nKey, /* The key */
  5132. const void *pData,int nData, /* The data */
  5133. int nZero, /* Extra zero bytes to append to pData */
  5134. int *pnSize /* Write cell size here */
  5135. ){
  5136. int nPayload;
  5137. const u8 *pSrc;
  5138. int nSrc, n, rc;
  5139. int spaceLeft;
  5140. MemPage *pOvfl = 0;
  5141. MemPage *pToRelease = 0;
  5142. unsigned char *pPrior;
  5143. unsigned char *pPayload;
  5144. BtShared *pBt = pPage->pBt;
  5145. Pgno pgnoOvfl = 0;
  5146. int nHeader;
  5147. CellInfo info;
  5148. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5149. /* pPage is not necessarily writeable since pCell might be auxiliary
  5150. ** buffer space that is separate from the pPage buffer area */
  5151. assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize]
  5152. || sqlite3PagerIswriteable(pPage->pDbPage) );
  5153. /* Fill in the header. */
  5154. nHeader = 0;
  5155. if( !pPage->leaf ){
  5156. nHeader += 4;
  5157. }
  5158. if( pPage->hasData ){
  5159. nHeader += putVarint(&pCell[nHeader], nData+nZero);
  5160. }else{
  5161. nData = nZero = 0;
  5162. }
  5163. nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
  5164. btreeParseCellPtr(pPage, pCell, &info);
  5165. assert( info.nHeader==nHeader );
  5166. assert( info.nKey==nKey );
  5167. assert( info.nData==(u32)(nData+nZero) );
  5168. /* Fill in the payload */
  5169. nPayload = nData + nZero;
  5170. if( pPage->intKey ){
  5171. pSrc = pData;
  5172. nSrc = nData;
  5173. nData = 0;
  5174. }else{
  5175. if( NEVER(nKey>0x7fffffff || pKey==0) ){
  5176. return SQLITE_CORRUPT_BKPT;
  5177. }
  5178. nPayload += (int)nKey;
  5179. pSrc = pKey;
  5180. nSrc = (int)nKey;
  5181. }
  5182. *pnSize = info.nSize;
  5183. spaceLeft = info.nLocal;
  5184. pPayload = &pCell[nHeader];
  5185. pPrior = &pCell[info.iOverflow];
  5186. while( nPayload>0 ){
  5187. if( spaceLeft==0 ){
  5188. #ifndef SQLITE_OMIT_AUTOVACUUM
  5189. Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
  5190. if( pBt->autoVacuum ){
  5191. do{
  5192. pgnoOvfl++;
  5193. } while(
  5194. PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
  5195. );
  5196. }
  5197. #endif
  5198. rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
  5199. #ifndef SQLITE_OMIT_AUTOVACUUM
  5200. /* If the database supports auto-vacuum, and the second or subsequent
  5201. ** overflow page is being allocated, add an entry to the pointer-map
  5202. ** for that page now.
  5203. **
  5204. ** If this is the first overflow page, then write a partial entry
  5205. ** to the pointer-map. If we write nothing to this pointer-map slot,
  5206. ** then the optimistic overflow chain processing in clearCell()
  5207. ** may misinterpret the uninitialized values and delete the
  5208. ** wrong pages from the database.
  5209. */
  5210. if( pBt->autoVacuum && rc==SQLITE_OK ){
  5211. u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
  5212. ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
  5213. if( rc ){
  5214. releasePage(pOvfl);
  5215. }
  5216. }
  5217. #endif
  5218. if( rc ){
  5219. releasePage(pToRelease);
  5220. return rc;
  5221. }
  5222. /* If pToRelease is not zero than pPrior points into the data area
  5223. ** of pToRelease. Make sure pToRelease is still writeable. */
  5224. assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
  5225. /* If pPrior is part of the data area of pPage, then make sure pPage
  5226. ** is still writeable */
  5227. assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
  5228. || sqlite3PagerIswriteable(pPage->pDbPage) );
  5229. put4byte(pPrior, pgnoOvfl);
  5230. releasePage(pToRelease);
  5231. pToRelease = pOvfl;
  5232. pPrior = pOvfl->aData;
  5233. put4byte(pPrior, 0);
  5234. pPayload = &pOvfl->aData[4];
  5235. spaceLeft = pBt->usableSize - 4;
  5236. }
  5237. n = nPayload;
  5238. if( n>spaceLeft ) n = spaceLeft;
  5239. /* If pToRelease is not zero than pPayload points into the data area
  5240. ** of pToRelease. Make sure pToRelease is still writeable. */
  5241. assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );
  5242. /* If pPayload is part of the data area of pPage, then make sure pPage
  5243. ** is still writeable */
  5244. assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
  5245. || sqlite3PagerIswriteable(pPage->pDbPage) );
  5246. if( nSrc>0 ){
  5247. if( n>nSrc ) n = nSrc;
  5248. assert( pSrc );
  5249. memcpy(pPayload, pSrc, n);
  5250. }else{
  5251. memset(pPayload, 0, n);
  5252. }
  5253. nPayload -= n;
  5254. pPayload += n;
  5255. pSrc += n;
  5256. nSrc -= n;
  5257. spaceLeft -= n;
  5258. if( nSrc==0 ){
  5259. nSrc = nData;
  5260. pSrc = pData;
  5261. }
  5262. }
  5263. releasePage(pToRelease);
  5264. return SQLITE_OK;
  5265. }
  5266. /*
  5267. ** Remove the i-th cell from pPage. This routine effects pPage only.
  5268. ** The cell content is not freed or deallocated. It is assumed that
  5269. ** the cell content has been copied someplace else. This routine just
  5270. ** removes the reference to the cell from pPage.
  5271. **
  5272. ** "sz" must be the number of bytes in the cell.
  5273. */
  5274. static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
  5275. u32 pc; /* Offset to cell content of cell being deleted */
  5276. u8 *data; /* pPage->aData */
  5277. u8 *ptr; /* Used to move bytes around within data[] */
  5278. u8 *endPtr; /* End of loop */
  5279. int rc; /* The return code */
  5280. int hdr; /* Beginning of the header. 0 most pages. 100 page 1 */
  5281. if( *pRC ) return;
  5282. assert( idx>=0 && idx<pPage->nCell );
  5283. assert( sz==cellSize(pPage, idx) );
  5284. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  5285. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5286. data = pPage->aData;
  5287. ptr = &pPage->aCellIdx[2*idx];
  5288. pc = get2byte(ptr);
  5289. hdr = pPage->hdrOffset;
  5290. testcase( pc==get2byte(&data[hdr+5]) );
  5291. testcase( pc+sz==pPage->pBt->usableSize );
  5292. if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){
  5293. *pRC = SQLITE_CORRUPT_BKPT;
  5294. return;
  5295. }
  5296. rc = freeSpace(pPage, pc, sz);
  5297. if( rc ){
  5298. *pRC = rc;
  5299. return;
  5300. }
  5301. endPtr = &pPage->aCellIdx[2*pPage->nCell - 2];
  5302. assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 ); /* ptr is always 2-byte aligned */
  5303. while( ptr<endPtr ){
  5304. *(u16*)ptr = *(u16*)&ptr[2];
  5305. ptr += 2;
  5306. }
  5307. pPage->nCell--;
  5308. put2byte(&data[hdr+3], pPage->nCell);
  5309. pPage->nFree += 2;
  5310. }
  5311. /*
  5312. ** Insert a new cell on pPage at cell index "i". pCell points to the
  5313. ** content of the cell.
  5314. **
  5315. ** If the cell content will fit on the page, then put it there. If it
  5316. ** will not fit, then make a copy of the cell content into pTemp if
  5317. ** pTemp is not null. Regardless of pTemp, allocate a new entry
  5318. ** in pPage->apOvfl[] and make it point to the cell content (either
  5319. ** in pTemp or the original pCell) and also record its index.
  5320. ** Allocating a new entry in pPage->aCell[] implies that
  5321. ** pPage->nOverflow is incremented.
  5322. **
  5323. ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
  5324. ** cell. The caller will overwrite them after this function returns. If
  5325. ** nSkip is non-zero, then pCell may not point to an invalid memory location
  5326. ** (but pCell+nSkip is always valid).
  5327. */
  5328. static void insertCell(
  5329. MemPage *pPage, /* Page into which we are copying */
  5330. int i, /* New cell becomes the i-th cell of the page */
  5331. u8 *pCell, /* Content of the new cell */
  5332. int sz, /* Bytes of content in pCell */
  5333. u8 *pTemp, /* Temp storage space for pCell, if needed */
  5334. Pgno iChild, /* If non-zero, replace first 4 bytes with this value */
  5335. int *pRC /* Read and write return code from here */
  5336. ){
  5337. int idx = 0; /* Where to write new cell content in data[] */
  5338. int j; /* Loop counter */
  5339. int end; /* First byte past the last cell pointer in data[] */
  5340. int ins; /* Index in data[] where new cell pointer is inserted */
  5341. int cellOffset; /* Address of first cell pointer in data[] */
  5342. u8 *data; /* The content of the whole page */
  5343. u8 *ptr; /* Used for moving information around in data[] */
  5344. u8 *endPtr; /* End of the loop */
  5345. int nSkip = (iChild ? 4 : 0);
  5346. if( *pRC ) return;
  5347. assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  5348. assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=10921 );
  5349. assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
  5350. assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
  5351. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5352. /* The cell should normally be sized correctly. However, when moving a
  5353. ** malformed cell from a leaf page to an interior page, if the cell size
  5354. ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
  5355. ** might be less than 8 (leaf-size + pointer) on the interior node. Hence
  5356. ** the term after the || in the following assert(). */
  5357. assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) );
  5358. if( pPage->nOverflow || sz+2>pPage->nFree ){
  5359. if( pTemp ){
  5360. memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
  5361. pCell = pTemp;
  5362. }
  5363. if( iChild ){
  5364. put4byte(pCell, iChild);
  5365. }
  5366. j = pPage->nOverflow++;
  5367. assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) );
  5368. pPage->apOvfl[j] = pCell;
  5369. pPage->aiOvfl[j] = (u16)i;
  5370. }else{
  5371. int rc = sqlite3PagerWrite(pPage->pDbPage);
  5372. if( rc!=SQLITE_OK ){
  5373. *pRC = rc;
  5374. return;
  5375. }
  5376. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  5377. data = pPage->aData;
  5378. cellOffset = pPage->cellOffset;
  5379. end = cellOffset + 2*pPage->nCell;
  5380. ins = cellOffset + 2*i;
  5381. rc = allocateSpace(pPage, sz, &idx);
  5382. if( rc ){ *pRC = rc; return; }
  5383. /* The allocateSpace() routine guarantees the following two properties
  5384. ** if it returns success */
  5385. assert( idx >= end+2 );
  5386. assert( idx+sz <= (int)pPage->pBt->usableSize );
  5387. pPage->nCell++;
  5388. pPage->nFree -= (u16)(2 + sz);
  5389. memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
  5390. if( iChild ){
  5391. put4byte(&data[idx], iChild);
  5392. }
  5393. ptr = &data[end];
  5394. endPtr = &data[ins];
  5395. assert( (SQLITE_PTR_TO_INT(ptr)&1)==0 ); /* ptr is always 2-byte aligned */
  5396. while( ptr>endPtr ){
  5397. *(u16*)ptr = *(u16*)&ptr[-2];
  5398. ptr -= 2;
  5399. }
  5400. put2byte(&data[ins], idx);
  5401. put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
  5402. #ifndef SQLITE_OMIT_AUTOVACUUM
  5403. if( pPage->pBt->autoVacuum ){
  5404. /* The cell may contain a pointer to an overflow page. If so, write
  5405. ** the entry for the overflow page into the pointer map.
  5406. */
  5407. ptrmapPutOvflPtr(pPage, pCell, pRC);
  5408. }
  5409. #endif
  5410. }
  5411. }
  5412. /*
  5413. ** Add a list of cells to a page. The page should be initially empty.
  5414. ** The cells are guaranteed to fit on the page.
  5415. */
  5416. static void assemblePage(
  5417. MemPage *pPage, /* The page to be assemblied */
  5418. int nCell, /* The number of cells to add to this page */
  5419. u8 **apCell, /* Pointers to cell bodies */
  5420. u16 *aSize /* Sizes of the cells */
  5421. ){
  5422. int i; /* Loop counter */
  5423. u8 *pCellptr; /* Address of next cell pointer */
  5424. int cellbody; /* Address of next cell body */
  5425. u8 * const data = pPage->aData; /* Pointer to data for pPage */
  5426. const int hdr = pPage->hdrOffset; /* Offset of header on pPage */
  5427. const int nUsable = pPage->pBt->usableSize; /* Usable size of page */
  5428. assert( pPage->nOverflow==0 );
  5429. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5430. assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt)
  5431. && (int)MX_CELL(pPage->pBt)<=10921);
  5432. assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  5433. /* Check that the page has just been zeroed by zeroPage() */
  5434. assert( pPage->nCell==0 );
  5435. assert( get2byteNotZero(&data[hdr+5])==nUsable );
  5436. pCellptr = &pPage->aCellIdx[nCell*2];
  5437. cellbody = nUsable;
  5438. for(i=nCell-1; i>=0; i--){
  5439. u16 sz = aSize[i];
  5440. pCellptr -= 2;
  5441. cellbody -= sz;
  5442. put2byte(pCellptr, cellbody);
  5443. memcpy(&data[cellbody], apCell[i], sz);
  5444. }
  5445. put2byte(&data[hdr+3], nCell);
  5446. put2byte(&data[hdr+5], cellbody);
  5447. pPage->nFree -= (nCell*2 + nUsable - cellbody);
  5448. pPage->nCell = (u16)nCell;
  5449. }
  /*
  ** Tuning parameters for the balancing operation.
  **
  ** NN is the number of neighbor pages on each side of the target page
  ** that take part in a balance.  NB is the total number of pages that
  ** take part: the target page plus NN neighbors on either side.
  **
  ** NN must be at least 1.  Raising it to 2 or 3 gives a modest
  ** improvement in SELECT and DELETE performance in exchange for a larger
  ** degradation in INSERT and UPDATE performance.  The NN value defined
  ** below appears to give the best results overall.
  */
  #define NN 1             /* Neighbors on each side of pPage */
  #define NB (1 + 2*NN)    /* Pages taking part in one balance */
  5464. #ifndef SQLITE_OMIT_QUICKBALANCE
  5465. /*
  5466. ** This version of balance() handles the common special case where
  5467. ** a new entry is being inserted on the extreme right-end of the
  5468. ** tree, in other words, when the new entry will become the largest
  5469. ** entry in the tree.
  5470. **
  5471. ** Instead of trying to balance the 3 right-most leaf pages, just add
  5472. ** a new page to the right-hand side and put the one new entry in
  5473. ** that page. This leaves the right side of the tree somewhat
  5474. ** unbalanced. But odds are that we will be inserting new entries
  5475. ** at the end soon afterwards so the nearly empty page will quickly
  5476. ** fill up. On average.
  5477. **
  5478. ** pPage is the leaf page which is the right-most page in the tree.
  5479. ** pParent is its parent. pPage must have a single overflow entry
  5480. ** which is also the right-most entry on the page.
  5481. **
  5482. ** The pSpace buffer is used to store a temporary copy of the divider
  5483. ** cell that will be inserted into pParent. Such a cell consists of a 4
  5484. ** byte page number followed by a variable length integer. In other
  5485. ** words, at most 13 bytes. Hence the pSpace buffer must be at
  5486. ** least 13 bytes in size.
  5487. */
  5488. static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
  5489. BtShared *const pBt = pPage->pBt; /* B-Tree Database */
  5490. MemPage *pNew; /* Newly allocated page */
  5491. int rc; /* Return Code */
  5492. Pgno pgnoNew; /* Page number of pNew */
  5493. assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5494. assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  5495. assert( pPage->nOverflow==1 );
  5496. /* This error condition is now caught prior to reaching this function */
  5497. if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;
  5498. /* Allocate a new page. This page will become the right-sibling of
  5499. ** pPage. Make the parent page writable, so that the new divider cell
  5500. ** may be inserted. If both these operations are successful, proceed.
  5501. */
  5502. rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
  5503. if( rc==SQLITE_OK ){
  5504. u8 *pOut = &pSpace[4];
  5505. u8 *pCell = pPage->apOvfl[0];
  5506. u16 szCell = cellSizePtr(pPage, pCell);
  5507. u8 *pStop;
  5508. assert( sqlite3PagerIswriteable(pNew->pDbPage) );
  5509. assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
  5510. zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
  5511. assemblePage(pNew, 1, &pCell, &szCell);
  5512. /* If this is an auto-vacuum database, update the pointer map
  5513. ** with entries for the new page, and any pointer from the
  5514. ** cell on the page to an overflow page. If either of these
  5515. ** operations fails, the return code is set, but the contents
  5516. ** of the parent page are still manipulated by thh code below.
  5517. ** That is Ok, at this point the parent page is guaranteed to
  5518. ** be marked as dirty. Returning an error code will cause a
  5519. ** rollback, undoing any changes made to the parent page.
  5520. */
  5521. if( ISAUTOVACUUM ){
  5522. ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
  5523. if( szCell>pNew->minLocal ){
  5524. ptrmapPutOvflPtr(pNew, pCell, &rc);
  5525. }
  5526. }
  5527. /* Create a divider cell to insert into pParent. The divider cell
  5528. ** consists of a 4-byte page number (the page number of pPage) and
  5529. ** a variable length key value (which must be the same value as the
  5530. ** largest key on pPage).
  5531. **
  5532. ** To find the largest key value on pPage, first find the right-most
  5533. ** cell on pPage. The first two fields of this cell are the
  5534. ** record-length (a variable length integer at most 32-bits in size)
  5535. ** and the key value (a variable length integer, may have any value).
  5536. ** The first of the while(...) loops below skips over the record-length
  5537. ** field. The second while(...) loop copies the key value from the
  5538. ** cell on pPage into the pSpace buffer.
  5539. */
  5540. pCell = findCell(pPage, pPage->nCell-1);
  5541. pStop = &pCell[9];
  5542. while( (*(pCell++)&0x80) && pCell<pStop );
  5543. pStop = &pCell[9];
  5544. while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
  5545. /* Insert the new divider cell into pParent. */
  5546. insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
  5547. 0, pPage->pgno, &rc);
  5548. /* Set the right-child pointer of pParent to point to the new page. */
  5549. put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  5550. /* Release the reference to the new page. */
  5551. releasePage(pNew);
  5552. }
  5553. return rc;
  5554. }
  5555. #endif /* SQLITE_OMIT_QUICKBALANCE */
  5556. #if 0
  5557. /*
  5558. ** This function does not contribute anything to the operation of SQLite.
  5559. ** it is sometimes activated temporarily while debugging code responsible
  5560. ** for setting pointer-map entries.
  5561. */
  5562. static int ptrmapCheckPages(MemPage **apPage, int nPage){
  5563. int i, j;
  5564. for(i=0; i<nPage; i++){
  5565. Pgno n;
  5566. u8 e;
  5567. MemPage *pPage = apPage[i];
  5568. BtShared *pBt = pPage->pBt;
  5569. assert( pPage->isInit );
  5570. for(j=0; j<pPage->nCell; j++){
  5571. CellInfo info;
  5572. u8 *z;
  5573. z = findCell(pPage, j);
  5574. btreeParseCellPtr(pPage, z, &info);
  5575. if( info.iOverflow ){
  5576. Pgno ovfl = get4byte(&z[info.iOverflow]);
  5577. ptrmapGet(pBt, ovfl, &e, &n);
  5578. assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
  5579. }
  5580. if( !pPage->leaf ){
  5581. Pgno child = get4byte(z);
  5582. ptrmapGet(pBt, child, &e, &n);
  5583. assert( n==pPage->pgno && e==PTRMAP_BTREE );
  5584. }
  5585. }
  5586. if( !pPage->leaf ){
  5587. Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  5588. ptrmapGet(pBt, child, &e, &n);
  5589. assert( n==pPage->pgno && e==PTRMAP_BTREE );
  5590. }
  5591. }
  5592. return 1;
  5593. }
  5594. #endif
  5595. /*
  5596. ** This function is used to copy the contents of the b-tree node stored
  5597. ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
  5598. ** the pointer-map entries for each child page are updated so that the
  5599. ** parent page stored in the pointer map is page pTo. If pFrom contained
  5600. ** any cells with overflow page pointers, then the corresponding pointer
  5601. ** map entries are also updated so that the parent page is page pTo.
  5602. **
  5603. ** If pFrom is currently carrying any overflow cells (entries in the
  5604. ** MemPage.apOvfl[] array), they are not copied to pTo.
  5605. **
  5606. ** Before returning, page pTo is reinitialized using btreeInitPage().
  5607. **
  5608. ** The performance of this function is not critical. It is only used by
  5609. ** the balance_shallower() and balance_deeper() procedures, neither of
  5610. ** which are called often under normal circumstances.
  5611. */
  5612. static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
  5613. if( (*pRC)==SQLITE_OK ){
  5614. BtShared * const pBt = pFrom->pBt;
  5615. u8 * const aFrom = pFrom->aData;
  5616. u8 * const aTo = pTo->aData;
  5617. int const iFromHdr = pFrom->hdrOffset;
  5618. int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
  5619. int rc;
  5620. int iData;
  5621. assert( pFrom->isInit );
  5622. assert( pFrom->nFree>=iToHdr );
  5623. assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
  5624. /* Copy the b-tree node content from page pFrom to page pTo. */
  5625. iData = get2byte(&aFrom[iFromHdr+5]);
  5626. memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
  5627. memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
  5628. /* Reinitialize page pTo so that the contents of the MemPage structure
  5629. ** match the new data. The initialization of pTo can actually fail under
  5630. ** fairly obscure circumstances, even though it is a copy of initialized
  5631. ** page pFrom.
  5632. */
  5633. pTo->isInit = 0;
  5634. rc = btreeInitPage(pTo);
  5635. if( rc!=SQLITE_OK ){
  5636. *pRC = rc;
  5637. return;
  5638. }
  5639. /* If this is an auto-vacuum database, update the pointer-map entries
  5640. ** for any b-tree or overflow pages that pTo now contains the pointers to.
  5641. */
  5642. if( ISAUTOVACUUM ){
  5643. *pRC = setChildPtrmaps(pTo);
  5644. }
  5645. }
  5646. }
  5647. /*
  5648. ** This routine redistributes cells on the iParentIdx'th child of pParent
  5649. ** (hereafter "the page") and up to 2 siblings so that all pages have about the
  5650. ** same amount of free space. Usually a single sibling on either side of the
  5651. ** page is used in the balancing, though both siblings might come from one
  5652. ** side if the page is the first or last child of its parent. If the page
  5653. ** has fewer than 2 siblings (something which can only happen if the page
  5654. ** is a root page or a child of a root page) then all available siblings
  5655. ** participate in the balancing.
  5656. **
  5657. ** The number of siblings of the page might be increased or decreased by
  5658. ** one or two in an effort to keep pages nearly full but not over full.
  5659. **
  5660. ** Note that when this routine is called, some of the cells on the page
  5661. ** might not actually be stored in MemPage.aData[]. This can happen
  5662. ** if the page is overfull. This routine ensures that all cells allocated
  5663. ** to the page and its siblings fit into MemPage.aData[] before returning.
  5664. **
  5665. ** In the course of balancing the page and its siblings, cells may be
  5666. ** inserted into or removed from the parent page (pParent). Doing so
  5667. ** may cause the parent page to become overfull or underfull. If this
  5668. ** happens, it is the responsibility of the caller to invoke the correct
  5669. ** balancing routine to fix this problem (see the balance() routine).
  5670. **
  5671. ** If this routine fails for any reason, it might leave the database
  5672. ** in a corrupted state. So if this routine fails, the database should
  5673. ** be rolled back.
  5674. **
  5675. ** The third argument to this function, aOvflSpace, is a pointer to a
  5676. ** buffer big enough to hold one page. If while inserting cells into the parent
  5677. ** page (pParent) the parent page becomes overfull, this buffer is
  5678. ** used to store the parent's overflow cells. Because this function inserts
  5679. ** a maximum of four divider cells into the parent page, and the maximum
  5680. ** size of a cell stored within an internal node is always less than 1/4
  5681. ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
  5682. ** enough for all overflow cells.
  5683. **
  5684. ** If aOvflSpace is set to a null pointer, this function returns
  5685. ** SQLITE_NOMEM.
  5686. */
  5687. #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
  5688. #pragma optimize("", off)
  5689. #endif
  5690. static int balance_nonroot( /* Rebalance the iParentIdx'th child of pParent together with its siblings */
  5691. MemPage *pParent, /* Parent page of siblings being balanced */
  5692. int iParentIdx, /* Index of "the page" in pParent */
  5693. u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */
  5694. int isRoot, /* True if pParent is a root-page */
  5695. int bBulk /* True if this call is part of a bulk load */
  5696. ){
  5697. BtShared *pBt; /* The whole database */
  5698. int nCell = 0; /* Number of cells in apCell[] */
  5699. int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
  5700. int nNew = 0; /* Number of pages in apNew[] */
  5701. int nOld; /* Number of pages in apOld[] */
  5702. int i, j, k; /* Loop counters; k also holds the per-copy byte size and later the new sibling count */
  5703. int nxDiv; /* Next divider slot in pParent->aCell[] */
  5704. int rc = SQLITE_OK; /* The return code */
  5705. u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */
  5706. int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
  5707. int usableSpace; /* Bytes in pPage beyond the header */
  5708. int pageFlags; /* Value of pPage->aData[0] */
  5709. int subtotal; /* Subtotal of bytes in cells on one page */
  5710. int iSpace1 = 0; /* First unused byte of aSpace1[] */
  5711. int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */
  5712. int szScratch; /* Size of scratch memory requested */
  5713. MemPage *apOld[NB]; /* pPage and up to two siblings */
  5714. MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
  5715. MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
  5716. u8 *pRight; /* Location in parent of right-sibling pointer */
  5717. u8 *apDiv[NB-1]; /* Divider cells in pParent */
  5718. int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
  5719. int szNew[NB+2]; /* Combined size of cells placed on i-th page */
  5720. u8 **apCell = 0; /* All cells being balanced */
  5721. u16 *szCell; /* Local size of all cells in apCell[] */
  5722. u8 *aSpace1; /* Space for copies of divider cells */
  5723. Pgno pgno; /* Temp var to store a page number in */
  5724. pBt = pParent->pBt;
  5725. assert( sqlite3_mutex_held(pBt->mutex) );
  5726. assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  5727. #if 0
  5728. TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
  5729. #endif
  5730. /* At this point pParent may have at most one overflow cell. And if
  5731. ** this overflow cell is present, it must be the cell with
  5732. ** index iParentIdx. This scenario comes about when this function
  5733. ** is called (indirectly) from sqlite3BtreeDelete().
  5734. */
  5735. assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
  5736. assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
  5737. if( !aOvflSpace ){
  5738. return SQLITE_NOMEM;
  5739. }
  5740. /* Find the sibling pages to balance. Also locate the cells in pParent
  5741. ** that divide the siblings. An attempt is made to find NN siblings on
  5742. ** either side of pPage. More siblings are taken from one side, however,
  5743. ** if there are fewer than NN siblings on the other side. If pParent
  5744. ** has NB or fewer children then all children of pParent are taken.
  5745. **
  5746. ** This loop also drops the divider cells from the parent page. This
  5747. ** way, the remainder of the function does not have to deal with any
  5748. ** overflow cells in the parent page, since if any existed they will
  5749. ** have already been removed.
  5750. */
  5751. i = pParent->nOverflow + pParent->nCell; /* Cells in pParent, counting any overflow cell */
  5752. if( i<2 ){
  5753. nxDiv = 0;
  5754. }else{
  5755. assert( bBulk==0 || bBulk==1 );
  5756. if( iParentIdx==0 ){
  5757. nxDiv = 0;
  5758. }else if( iParentIdx==i ){
  5759. nxDiv = i-2+bBulk;
  5760. }else{
  5761. assert( bBulk==0 );
  5762. nxDiv = iParentIdx-1;
  5763. }
  5764. i = 2-bBulk;
  5765. }
  5766. nOld = i+1; /* apOld[0..i] are loaded by the loop below */
  5767. if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
  5768. pRight = &pParent->aData[pParent->hdrOffset+8];
  5769. }else{
  5770. pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
  5771. }
  5772. pgno = get4byte(pRight);
  5773. while( 1 ){
  5774. rc = getAndInitPage(pBt, pgno, &apOld[i], 0);
  5775. if( rc ){
  5776. memset(apOld, 0, (i+1)*sizeof(MemPage*)); /* Slots 0..i were never loaded; clear them before cleanup releases apOld[] */
  5777. goto balance_cleanup;
  5778. }
  5779. nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  5780. if( (i--)==0 ) break; /* apOld[0] has been loaded: no further divider to fetch */
  5781. if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){ /* Divider i is the parent's overflow cell */
  5782. apDiv[i] = pParent->apOvfl[0];
  5783. pgno = get4byte(apDiv[i]);
  5784. szNew[i] = cellSizePtr(pParent, apDiv[i]);
  5785. pParent->nOverflow = 0;
  5786. }else{
  5787. apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
  5788. pgno = get4byte(apDiv[i]);
  5789. szNew[i] = cellSizePtr(pParent, apDiv[i]);
  5790. /* Drop the cell from the parent page. apDiv[i] still points to
  5791. ** the cell within the parent, even though it has been dropped.
  5792. ** This is safe because dropping a cell only overwrites the first
  5793. ** four bytes of it, and this function does not need the first
  5794. ** four bytes of the divider cell. So the pointer is safe to use
  5795. ** later on.
  5796. **
  5797. ** But not if we are in secure-delete mode. In secure-delete mode,
  5798. ** the dropCell() routine will overwrite the entire cell with zeroes.
  5799. ** In this case, temporarily copy the cell into the aOvflSpace[]
  5800. ** buffer. It will be copied out again as soon as the aSpace1[] buffer
  5801. ** is allocated. */
  5802. if( pBt->btsFlags & BTS_SECURE_DELETE ){
  5803. int iOff;
  5804. iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
  5805. if( (iOff+szNew[i])>(int)pBt->usableSize ){
  5806. rc = SQLITE_CORRUPT_BKPT;
  5807. memset(apOld, 0, (i+1)*sizeof(MemPage*)); /* Slots 0..i are not loaded yet; clear them before cleanup */
  5808. goto balance_cleanup;
  5809. }else{
  5810. memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
  5811. apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
  5812. }
  5813. }
  5814. dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
  5815. }
  5816. }
  5817. /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  5818. ** alignment */
  5819. nMaxCells = (nMaxCells + 3)&~3;
  5820. /*
  5821. ** Allocate space for memory structures
  5822. */
  5823. k = pBt->pageSize + ROUND8(sizeof(MemPage)); /* Bytes per private page copy: MemPage struct followed by the page image */
  5824. szScratch =
  5825. nMaxCells*sizeof(u8*) /* apCell */
  5826. + nMaxCells*sizeof(u16) /* szCell */
  5827. + pBt->pageSize /* aSpace1 */
  5828. + k*nOld; /* Page copies (apCopy) */
  5829. apCell = sqlite3ScratchMalloc( szScratch );
  5830. if( apCell==0 ){
  5831. rc = SQLITE_NOMEM;
  5832. goto balance_cleanup;
  5833. }
  5834. szCell = (u16*)&apCell[nMaxCells];
  5835. aSpace1 = (u8*)&szCell[nMaxCells];
  5836. assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
  5837. /*
  5838. ** Load pointers to all cells on sibling pages and the divider cells
  5839. ** into the local apCell[] array. Make copies of the divider cells
  5840. ** into space obtained from aSpace1[] and remove the divider cells
  5841. ** from pParent.
  5842. **
  5843. ** If the siblings are on leaf pages, then the child pointers of the
  5844. ** divider cells are stripped from the cells before they are copied
  5845. ** into aSpace1[]. In this way, all cells in apCell[] are without
  5846. ** child pointers. If siblings are not leaves, then all cells in
  5847. ** apCell[] include child pointers. Either way, all cells in apCell[]
  5848. ** are alike.
  5849. **
  5850. ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
  5851. ** leafData: 1 if pPage holds key+data and pParent holds only keys.
  5852. */
  5853. leafCorrection = apOld[0]->leaf*4;
  5854. leafData = apOld[0]->hasData;
  5855. for(i=0; i<nOld; i++){
  5856. int limit;
  5857. /* Before doing anything else, take a copy of the i'th original sibling
  5858. ** The rest of this function will use data from the copies rather
  5859. ** than the original pages since the original pages will be in the
  5860. ** process of being overwritten. */
  5861. MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
  5862. memcpy(pOld, apOld[i], sizeof(MemPage));
  5863. pOld->aData = (void*)&pOld[1]; /* The copied page image sits directly after the copied MemPage */
  5864. memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);
  5865. limit = pOld->nCell+pOld->nOverflow;
  5866. if( pOld->nOverflow>0 ){
  5867. for(j=0; j<limit; j++){
  5868. assert( nCell<nMaxCells );
  5869. apCell[nCell] = findOverflowCell(pOld, j);
  5870. szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
  5871. nCell++;
  5872. }
  5873. }else{
  5874. u8 *aData = pOld->aData;
  5875. u16 maskPage = pOld->maskPage;
  5876. u16 cellOffset = pOld->cellOffset;
  5877. for(j=0; j<limit; j++){
  5878. assert( nCell<nMaxCells );
  5879. apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j);
  5880. szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
  5881. nCell++;
  5882. }
  5883. }
  5884. if( i<nOld-1 && !leafData){
  5885. u16 sz = (u16)szNew[i]; /* szNew[i] still holds the size of divider i recorded while loading siblings */
  5886. u8 *pTemp;
  5887. assert( nCell<nMaxCells );
  5888. szCell[nCell] = sz;
  5889. pTemp = &aSpace1[iSpace1];
  5890. iSpace1 += sz;
  5891. assert( sz<=pBt->maxLocal+23 );
  5892. assert( iSpace1 <= (int)pBt->pageSize );
  5893. memcpy(pTemp, apDiv[i], sz);
  5894. apCell[nCell] = pTemp+leafCorrection;
  5895. assert( leafCorrection==0 || leafCorrection==4 );
  5896. szCell[nCell] = szCell[nCell] - leafCorrection;
  5897. if( !pOld->leaf ){
  5898. assert( leafCorrection==0 );
  5899. assert( pOld->hdrOffset==0 );
  5900. /* The right pointer of the child page pOld becomes the left
  5901. ** pointer of the divider cell */
  5902. memcpy(apCell[nCell], &pOld->aData[8], 4);
  5903. }else{
  5904. assert( leafCorrection==4 );
  5905. if( szCell[nCell]<4 ){
  5906. /* Do not allow any cells smaller than 4 bytes. */
  5907. szCell[nCell] = 4;
  5908. }
  5909. }
  5910. nCell++;
  5911. }
  5912. }
  5913. /*
  5914. ** Figure out the number of pages needed to hold all nCell cells.
  5915. ** Store this number in "k". Also compute szNew[] which is the total
  5916. ** size of all cells on the i-th page and cntNew[] which is the index
  5917. ** in apCell[] of the cell that divides page i from page i+1.
  5918. ** cntNew[k] should equal nCell.
  5919. **
  5920. ** Values computed by this block:
  5921. **
  5922. ** k: The total number of sibling pages
  5923. ** szNew[i]: Space used on the i-th sibling page.
  5924. ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  5925. ** the right of the i-th sibling page.
  5926. ** usableSpace: Number of bytes of space available on each sibling.
  5927. **
  5928. */
  5929. usableSpace = pBt->usableSize - 12 + leafCorrection;
  5930. for(subtotal=k=i=0; i<nCell; i++){
  5931. assert( i<nMaxCells );
  5932. subtotal += szCell[i] + 2; /* Each cell also costs a 2-byte slot in the cell pointer array */
  5933. if( subtotal > usableSpace ){
  5934. szNew[k] = subtotal - szCell[i];
  5935. cntNew[k] = i;
  5936. if( leafData ){ i--; } /* No cell is consumed as a divider here, so cell i is revisited as the first cell of the next page */
  5937. subtotal = 0;
  5938. k++;
  5939. if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
  5940. }
  5941. }
  5942. szNew[k] = subtotal;
  5943. cntNew[k] = nCell;
  5944. k++;
  5945. /*
  5946. ** The packing computed by the previous block is biased toward the siblings
  5947. ** on the left side. The left siblings are always nearly full, while the
  5948. ** right-most sibling might be nearly empty. This block of code attempts
  5949. ** to adjust the packing of siblings to get a better balance.
  5950. **
  5951. ** This adjustment is more than an optimization. The packing above might
  5952. ** be so out of balance as to be illegal. For example, the right-most
  5953. ** sibling might be completely empty. This adjustment is not optional.
  5954. */
  5955. for(i=k-1; i>0; i--){
  5956. int szRight = szNew[i]; /* Size of sibling on the right */
  5957. int szLeft = szNew[i-1]; /* Size of sibling on the left */
  5958. int r; /* Index of right-most cell in left sibling */
  5959. int d; /* Index of first cell to the left of right sibling */
  5960. r = cntNew[i-1] - 1;
  5961. d = r + 1 - leafData;
  5962. assert( d<nMaxCells );
  5963. assert( r<nMaxCells );
  5964. while( szRight==0
  5965. || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2))
  5966. ){
  5967. szRight += szCell[d] + 2;
  5968. szLeft -= szCell[r] + 2;
  5969. cntNew[i-1]--;
  5970. r = cntNew[i-1] - 1;
  5971. d = r + 1 - leafData;
  5972. }
  5973. szNew[i] = szRight;
  5974. szNew[i-1] = szLeft;
  5975. }
  5976. /* Either we found one or more cells (cntNew[0]>0) or pPage is
  5977. ** a virtual root page. A virtual root page is when the real root
  5978. ** page is page 1 and we are the only child of that page.
  5979. **
  5980. ** UPDATE: The assert() below is not necessarily true if the database
  5981. ** file is corrupt. The corruption will be detected and reported later
  5982. ** in this procedure so there is no need to act upon it now.
  5983. */
  5984. #if 0
  5985. assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
  5986. #endif
  5987. TRACE(("BALANCE: old: %d %d %d ",
  5988. apOld[0]->pgno,
  5989. nOld>=2 ? apOld[1]->pgno : 0,
  5990. nOld>=3 ? apOld[2]->pgno : 0
  5991. ));
  5992. /*
  5993. ** Allocate k new pages. Reuse old pages where possible.
  5994. */
  5995. if( apOld[0]->pgno<=1 ){
  5996. rc = SQLITE_CORRUPT_BKPT;
  5997. goto balance_cleanup;
  5998. }
  5999. pageFlags = apOld[0]->aData[0];
  6000. for(i=0; i<k; i++){
  6001. MemPage *pNew;
  6002. if( i<nOld ){
  6003. pNew = apNew[i] = apOld[i];
  6004. apOld[i] = 0; /* The reference now lives in apNew[i], so cleanup releases it only once */
  6005. rc = sqlite3PagerWrite(pNew->pDbPage);
  6006. nNew++;
  6007. if( rc ) goto balance_cleanup;
  6008. }else{
  6009. assert( i>0 );
  6010. rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
  6011. if( rc ) goto balance_cleanup;
  6012. apNew[i] = pNew;
  6013. nNew++;
  6014. /* Set the pointer-map entry for the new sibling page. */
  6015. if( ISAUTOVACUUM ){
  6016. ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
  6017. if( rc!=SQLITE_OK ){
  6018. goto balance_cleanup;
  6019. }
  6020. }
  6021. }
  6022. }
  6023. /* Free any old pages that were not reused as new pages.
  6024. */
  6025. while( i<nOld ){
  6026. freePage(apOld[i], &rc);
  6027. if( rc ) goto balance_cleanup;
  6028. releasePage(apOld[i]);
  6029. apOld[i] = 0;
  6030. i++;
  6031. }
  6032. /*
  6033. ** Put the new pages in ascending order. This helps to
  6034. ** keep entries in the disk file in order so that a scan
  6035. ** of the table is a linear scan through the file. That
  6036. ** in turn helps the operating system to deliver pages
  6037. ** from the disk more rapidly.
  6038. **
  6039. ** An O(n^2) insertion sort algorithm is used, but since
  6040. ** n is never more than NB (a small constant), that should
  6041. ** not be a problem.
  6042. **
  6043. ** When NB==3, this one optimization makes the database
  6044. ** about 25% faster for large insertions and deletions.
  6045. */
  6046. for(i=0; i<k-1; i++){
  6047. int minV = apNew[i]->pgno;
  6048. int minI = i;
  6049. for(j=i+1; j<k; j++){
  6050. if( apNew[j]->pgno<(unsigned)minV ){
  6051. minI = j;
  6052. minV = apNew[j]->pgno;
  6053. }
  6054. }
  6055. if( minI>i ){
  6056. MemPage *pT;
  6057. pT = apNew[i];
  6058. apNew[i] = apNew[minI];
  6059. apNew[minI] = pT;
  6060. }
  6061. }
  6062. TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
  6063. apNew[0]->pgno, szNew[0],
  6064. nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
  6065. nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
  6066. nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
  6067. nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));
  6068. assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  6069. put4byte(pRight, apNew[nNew-1]->pgno); /* The parent slot located before balancing now names the right-most new sibling */
  6070. /*
  6071. ** Evenly distribute the data in apCell[] across the new pages.
  6072. ** Insert divider cells into pParent as necessary.
  6073. */
  6074. j = 0;
  6075. for(i=0; i<nNew; i++){
  6076. /* Assemble the new sibling page. */
  6077. MemPage *pNew = apNew[i];
  6078. assert( j<nMaxCells );
  6079. zeroPage(pNew, pageFlags);
  6080. assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
  6081. assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
  6082. assert( pNew->nOverflow==0 );
  6083. j = cntNew[i];
  6084. /* If the sibling page assembled above was not the right-most sibling,
  6085. ** insert a divider cell into the parent page.
  6086. */
  6087. assert( i<nNew-1 || j==nCell );
  6088. if( j<nCell ){
  6089. u8 *pCell;
  6090. u8 *pTemp;
  6091. int sz;
  6092. assert( j<nMaxCells );
  6093. pCell = apCell[j];
  6094. sz = szCell[j] + leafCorrection;
  6095. pTemp = &aOvflSpace[iOvflSpace];
  6096. if( !pNew->leaf ){
  6097. memcpy(&pNew->aData[8], pCell, 4); /* The divider's leading 4 bytes (its child pointer) become pNew's right-child pointer */
  6098. }else if( leafData ){
  6099. /* If the tree is a leaf-data tree, and the siblings are leaves,
  6100. ** then there is no divider cell in apCell[]. Instead, the divider
  6101. ** cell consists of the integer key for the right-most cell of
  6102. ** the sibling-page assembled above only.
  6103. */
  6104. CellInfo info;
  6105. j--;
  6106. btreeParseCellPtr(pNew, apCell[j], &info);
  6107. pCell = pTemp;
  6108. sz = 4 + putVarint(&pCell[4], info.nKey);
  6109. pTemp = 0;
  6110. }else{
  6111. pCell -= 4;
  6112. /* Obscure case for non-leaf-data trees: If the cell at pCell was
  6113. ** previously stored on a leaf node, and its reported size was 4
  6114. ** bytes, then it may actually be smaller than this
  6115. ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
  6116. ** any cell). But it is important to pass the correct size to
  6117. ** insertCell(), so reparse the cell now.
  6118. **
  6119. ** Note that this can never happen in an SQLite data file, as all
  6120. ** cells are at least 4 bytes. It only happens in b-trees used
  6121. ** to evaluate "IN (SELECT ...)" and similar clauses.
  6122. */
  6123. if( szCell[j]==4 ){
  6124. assert(leafCorrection==4);
  6125. sz = cellSizePtr(pParent, pCell);
  6126. }
  6127. }
  6128. iOvflSpace += sz;
  6129. assert( sz<=pBt->maxLocal+23 );
  6130. assert( iOvflSpace <= (int)pBt->pageSize );
  6131. insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
  6132. if( rc!=SQLITE_OK ) goto balance_cleanup;
  6133. assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  6134. j++;
  6135. nxDiv++;
  6136. }
  6137. }
  6138. assert( j==nCell );
  6139. assert( nOld>0 );
  6140. assert( nNew>0 );
  6141. if( (pageFlags & PTF_LEAF)==0 ){
  6142. u8 *zChild = &apCopy[nOld-1]->aData[8]; /* Right-child pointer of the last original sibling, read from its private copy */
  6143. memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
  6144. }
  6145. if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
  6146. /* The root page of the b-tree now contains no cells. The only sibling
  6147. ** page is the right-child of the parent. Copy the contents of the
  6148. ** child page into the parent, decreasing the overall height of the
  6149. ** b-tree structure by one. This is described as the "balance-shallower"
  6150. ** sub-algorithm in some documentation.
  6151. **
  6152. ** If this is an auto-vacuum database, the call to copyNodeContent()
  6153. ** sets all pointer-map entries corresponding to database image pages
  6154. ** for which the pointer is stored within the content being copied.
  6155. **
  6156. ** The second assert below verifies that the child page is defragmented
  6157. ** (it must be, as it was just reconstructed using assemblePage()). This
  6158. ** is important if the parent page happens to be page 1 of the database
  6159. ** image. */
  6160. assert( nNew==1 );
  6161. assert( apNew[0]->nFree ==
  6162. (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
  6163. );
  6164. copyNodeContent(apNew[0], pParent, &rc);
  6165. freePage(apNew[0], &rc);
  6166. }else if( ISAUTOVACUUM ){
  6167. /* Fix the pointer-map entries for all the cells that were shifted around.
  6168. ** There are several different types of pointer-map entries that need to
  6169. ** be dealt with by this routine. Some of these have been set already, but
  6170. ** many have not. The following is a summary:
  6171. **
  6172. ** 1) The entries associated with new sibling pages that were not
  6173. ** siblings when this function was called. These have already
  6174. ** been set. We don't need to worry about old siblings that were
  6175. ** moved to the free-list - the freePage() code has taken care
  6176. ** of those.
  6177. **
  6178. ** 2) The pointer-map entries associated with the first overflow
  6179. ** page in any overflow chains used by new divider cells. These
  6180. ** have also already been taken care of by the insertCell() code.
  6181. **
  6182. ** 3) If the sibling pages are not leaves, then the child pages of
  6183. ** cells stored on the sibling pages may need to be updated.
  6184. **
  6185. ** 4) If the sibling pages are not internal intkey nodes, then any
  6186. ** overflow pages used by these cells may need to be updated
  6187. ** (internal intkey nodes never contain pointers to overflow pages).
  6188. **
  6189. ** 5) If the sibling pages are not leaves, then the pointer-map
  6190. ** entries for the right-child pages of each sibling may need
  6191. ** to be updated.
  6192. **
  6193. ** Cases 1 and 2 are dealt with above by other code. The next
  6194. ** block deals with cases 3 and 4 and the one after that, case 5. Since
  6195. ** setting a pointer map entry is a relatively expensive operation, this
  6196. ** code only sets pointer map entries for child or overflow pages that have
  6197. ** actually moved between pages. */
  6198. MemPage *pNew = apNew[0];
  6199. MemPage *pOld = apCopy[0];
  6200. int nOverflow = pOld->nOverflow;
  6201. int iNextOld = pOld->nCell + nOverflow;
  6202. int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
  6203. j = 0; /* Current 'old' sibling page */
  6204. k = 0; /* Current 'new' sibling page */
  6205. for(i=0; i<nCell; i++){
  6206. int isDivider = 0;
  6207. while( i==iNextOld ){
  6208. /* Cell i is the cell immediately following the last cell on old
  6209. ** sibling page j. If the siblings are not leaf pages of an
  6210. ** intkey b-tree, then cell i was a divider cell. */
  6211. assert( j+1 < ArraySize(apCopy) );
  6212. assert( j+1 < nOld );
  6213. pOld = apCopy[++j];
  6214. iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
  6215. if( pOld->nOverflow ){
  6216. nOverflow = pOld->nOverflow;
  6217. iOverflow = i + !leafData + pOld->aiOvfl[0];
  6218. }
  6219. isDivider = !leafData;
  6220. }
  6221. assert(nOverflow>0 || iOverflow<i );
  6222. assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
  6223. assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
  6224. if( i==iOverflow ){
  6225. isDivider = 1; /* An overflow cell is handled like a divider: its pointer-map entries are always rewritten below */
  6226. if( (--nOverflow)>0 ){
  6227. iOverflow++;
  6228. }
  6229. }
  6230. if( i==cntNew[k] ){
  6231. /* Cell i is the cell immediately following the last cell on new
  6232. ** sibling page k. If the siblings are not leaf pages of an
  6233. ** intkey b-tree, then cell i is a divider cell. */
  6234. pNew = apNew[++k];
  6235. if( !leafData ) continue;
  6236. }
  6237. assert( j<nOld );
  6238. assert( k<nNew );
  6239. /* If the cell was originally divider cell (and is not now) or
  6240. ** an overflow cell, or if the cell was located on a different sibling
  6241. ** page before the balancing, then the pointer map entries associated
  6242. ** with any child or overflow pages need to be updated. */
  6243. if( isDivider || pOld->pgno!=pNew->pgno ){
  6244. if( !leafCorrection ){
  6245. ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
  6246. }
  6247. if( szCell[i]>pNew->minLocal ){
  6248. ptrmapPutOvflPtr(pNew, apCell[i], &rc);
  6249. }
  6250. }
  6251. }
  6252. if( !leafCorrection ){
  6253. for(i=0; i<nNew; i++){
  6254. u32 key = get4byte(&apNew[i]->aData[8]); /* Right-child page number of new sibling i */
  6255. ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
  6256. }
  6257. }
  6258. #if 0
  6259. /* The ptrmapCheckPages() contains assert() statements that verify that
  6260. ** all pointer map pages are set correctly. This is helpful while
  6261. ** debugging. This is usually disabled because a corrupt database may
  6262. ** cause an assert() statement to fail. */
  6263. ptrmapCheckPages(apNew, nNew);
  6264. ptrmapCheckPages(&pParent, 1);
  6265. #endif
  6266. }
  6267. assert( pParent->isInit );
  6268. TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
  6269. nOld, nNew, nCell));
  6270. /*
  6271. ** Cleanup before returning. Every error path above jumps here: the
  6272. ** scratch buffer is freed and whatever is still recorded in apOld[]
  6273. ** and apNew[] is passed to releasePage().
  6274. */
  6275. balance_cleanup:
  6276. sqlite3ScratchFree(apCell);
  6277. for(i=0; i<nOld; i++){
  6278. releasePage(apOld[i]);
  6279. }
  6280. for(i=0; i<nNew; i++){
  6281. releasePage(apNew[i]);
  6282. }
  6283. return rc;
  6284. }
  6283. #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
  6284. #pragma optimize("", on)
  6285. #endif
  6286. /*
  6287. ** This function is called when the root page of a b-tree structure is
  6288. ** overfull (has one or more overflow pages).
  6289. **
  6290. ** A new child page is allocated and the contents of the current root
  6291. ** page, including overflow cells, are copied into the child. The root
  6292. ** page is then overwritten to make it an empty page with the right-child
  6293. ** pointer pointing to the new page.
  6294. **
  6295. ** Before returning, all pointer-map entries corresponding to pages
  6296. ** that the new child-page now contains pointers to are updated. The
  6297. ** entry corresponding to the new right-child pointer of the root
  6298. ** page is also updated.
  6299. **
  6300. ** If successful, *ppChild is set to contain a reference to the child
  6301. ** page and SQLITE_OK is returned. In this case the caller is required
  6302. ** to call releasePage() on *ppChild exactly once. If an error occurs,
  6303. ** an error code is returned and *ppChild is set to 0.
  6304. */
  6305. static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
  6306. int rc; /* Return value from subprocedures */
  6307. MemPage *pChild = 0; /* Pointer to a new child page */
  6308. Pgno pgnoChild = 0; /* Page number of the new child page */
  6309. BtShared *pBt = pRoot->pBt; /* The BTree */
  6310. assert( pRoot->nOverflow>0 );
  6311. assert( sqlite3_mutex_held(pBt->mutex) );
  6312. /* Make pRoot, the root page of the b-tree, writable. Allocate a new
  6313. ** page that will become the new right-child of pPage. Copy the contents
  6314. ** of the node stored on pRoot into the new child page.
  6315. */
  6316. rc = sqlite3PagerWrite(pRoot->pDbPage);
  6317. if( rc==SQLITE_OK ){
  6318. rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
  6319. copyNodeContent(pRoot, pChild, &rc);
  6320. if( ISAUTOVACUUM ){
  6321. ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
  6322. }
  6323. }
  6324. if( rc ){
  6325. *ppChild = 0;
  6326. releasePage(pChild);
  6327. return rc;
  6328. }
  6329. assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  6330. assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  6331. assert( pChild->nCell==pRoot->nCell );
  6332. TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
  6333. /* Copy the overflow cells from pRoot to pChild */
  6334. memcpy(pChild->aiOvfl, pRoot->aiOvfl,
  6335. pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
  6336. memcpy(pChild->apOvfl, pRoot->apOvfl,
  6337. pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
  6338. pChild->nOverflow = pRoot->nOverflow;
  6339. /* Zero the contents of pRoot. Then install pChild as the right-child. */
  6340. zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
  6341. put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
  6342. *ppChild = pChild;
  6343. return SQLITE_OK;
  6344. }
  6345. /*
  6346. ** The page that pCur currently points to has just been modified in
  6347. ** some way. This function figures out if this modification means the
  6348. ** tree needs to be balanced, and if so calls the appropriate balancing
  6349. ** routine. Balancing routines are:
  6350. **
  6351. ** balance_quick()
  6352. ** balance_deeper()
  6353. ** balance_nonroot()
  6354. */
  6355. static int balance(BtCursor *pCur){
  6356. int rc = SQLITE_OK;
  6357. const int nMin = pCur->pBt->usableSize * 2 / 3;
  6358. u8 aBalanceQuickSpace[13];
  6359. u8 *pFree = 0;
  6360. TESTONLY( int balance_quick_called = 0 );
  6361. TESTONLY( int balance_deeper_called = 0 );
  6362. do {
  6363. int iPage = pCur->iPage;
  6364. MemPage *pPage = pCur->apPage[iPage];
  6365. if( iPage==0 ){
  6366. if( pPage->nOverflow ){
  6367. /* The root page of the b-tree is overfull. In this case call the
  6368. ** balance_deeper() function to create a new child for the root-page
  6369. ** and copy the current contents of the root-page to it. The
  6370. ** next iteration of the do-loop will balance the child page.
  6371. */
  6372. assert( (balance_deeper_called++)==0 );
  6373. rc = balance_deeper(pPage, &pCur->apPage[1]);
  6374. if( rc==SQLITE_OK ){
  6375. pCur->iPage = 1;
  6376. pCur->aiIdx[0] = 0;
  6377. pCur->aiIdx[1] = 0;
  6378. assert( pCur->apPage[1]->nOverflow );
  6379. }
  6380. }else{
  6381. break;
  6382. }
  6383. }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
  6384. break;
  6385. }else{
  6386. MemPage * const pParent = pCur->apPage[iPage-1];
  6387. int const iIdx = pCur->aiIdx[iPage-1];
  6388. rc = sqlite3PagerWrite(pParent->pDbPage);
  6389. if( rc==SQLITE_OK ){
  6390. #ifndef SQLITE_OMIT_QUICKBALANCE
  6391. if( pPage->hasData
  6392. && pPage->nOverflow==1
  6393. && pPage->aiOvfl[0]==pPage->nCell
  6394. && pParent->pgno!=1
  6395. && pParent->nCell==iIdx
  6396. ){
  6397. /* Call balance_quick() to create a new sibling of pPage on which
  6398. ** to store the overflow cell. balance_quick() inserts a new cell
  6399. ** into pParent, which may cause pParent overflow. If this
  6400. ** happens, the next interation of the do-loop will balance pParent
  6401. ** use either balance_nonroot() or balance_deeper(). Until this
  6402. ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
  6403. ** buffer.
  6404. **
  6405. ** The purpose of the following assert() is to check that only a
  6406. ** single call to balance_quick() is made for each call to this
  6407. ** function. If this were not verified, a subtle bug involving reuse
  6408. ** of the aBalanceQuickSpace[] might sneak in.
  6409. */
  6410. assert( (balance_quick_called++)==0 );
  6411. rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
  6412. }else
  6413. #endif
  6414. {
  6415. /* In this case, call balance_nonroot() to redistribute cells
  6416. ** between pPage and up to 2 of its sibling pages. This involves
  6417. ** modifying the contents of pParent, which may cause pParent to
  6418. ** become overfull or underfull. The next iteration of the do-loop
  6419. ** will balance the parent page to correct this.
  6420. **
  6421. ** If the parent page becomes overfull, the overflow cell or cells
  6422. ** are stored in the pSpace buffer allocated immediately below.
  6423. ** A subsequent iteration of the do-loop will deal with this by
  6424. ** calling balance_nonroot() (balance_deeper() may be called first,
  6425. ** but it doesn't deal with overflow cells - just moves them to a
  6426. ** different page). Once this subsequent call to balance_nonroot()
  6427. ** has completed, it is safe to release the pSpace buffer used by
  6428. ** the previous call, as the overflow cell data will have been
  6429. ** copied either into the body of a database page or into the new
  6430. ** pSpace buffer passed to the latter call to balance_nonroot().
  6431. */
  6432. u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
  6433. rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
  6434. if( pFree ){
  6435. /* If pFree is not NULL, it points to the pSpace buffer used
  6436. ** by a previous call to balance_nonroot(). Its contents are
  6437. ** now stored either on real database pages or within the
  6438. ** new pSpace buffer, so it may be safely freed here. */
  6439. sqlite3PageFree(pFree);
  6440. }
  6441. /* The pSpace buffer will be freed after the next call to
  6442. ** balance_nonroot(), or just before this function returns, whichever
  6443. ** comes first. */
  6444. pFree = pSpace;
  6445. }
  6446. }
  6447. pPage->nOverflow = 0;
  6448. /* The next iteration of the do-loop balances the parent page. */
  6449. releasePage(pPage);
  6450. pCur->iPage--;
  6451. }
  6452. }while( rc==SQLITE_OK );
  6453. if( pFree ){
  6454. sqlite3PageFree(pFree);
  6455. }
  6456. return rc;
  6457. }
  6458. /*
  6459. ** Insert a new record into the BTree. The key is given by (pKey,nKey)
  6460. ** and the data is given by (pData,nData). The cursor is used only to
  6461. ** define what table the record should be inserted into. The cursor
  6462. ** is left pointing at a random location.
  6463. **
  6464. ** For an INTKEY table, only the nKey value of the key is used. pKey is
  6465. ** ignored. For a ZERODATA table, the pData and nData are both ignored.
  6466. **
  6467. ** If the seekResult parameter is non-zero, then a successful call to
  6468. ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already
  6469. ** been performed. seekResult is the search result returned (a negative
  6470. ** number if pCur points at an entry that is smaller than (pKey, nKey), or
  6471. ** a positive value if pCur points at an etry that is larger than
  6472. ** (pKey, nKey)).
  6473. **
  6474. ** If the seekResult parameter is non-zero, then the caller guarantees that
  6475. ** cursor pCur is pointing at the existing copy of a row that is to be
  6476. ** overwritten. If the seekResult parameter is 0, then cursor pCur may
  6477. ** point to any entry or to no entry at all and so this function has to seek
  6478. ** the cursor before the new key can be inserted.
  6479. */
  6480. int sqlite3BtreeInsert(
  6481. BtCursor *pCur, /* Insert data into the table of this cursor */
  6482. const void *pKey, i64 nKey, /* The key of the new record */
  6483. const void *pData, int nData, /* The data of the new record */
  6484. int nZero, /* Number of extra 0 bytes to append to data */
  6485. int appendBias, /* True if this is likely an append */
  6486. int seekResult /* Result of prior MovetoUnpacked() call */
  6487. ){
  6488. int rc;
  6489. int loc = seekResult; /* -1: before desired location +1: after */
  6490. int szNew = 0;
  6491. int idx;
  6492. MemPage *pPage;
  6493. Btree *p = pCur->pBtree;
  6494. BtShared *pBt = p->pBt;
  6495. unsigned char *oldCell;
  6496. unsigned char *newCell = 0;
  6497. if( pCur->eState==CURSOR_FAULT ){
  6498. assert( pCur->skipNext!=SQLITE_OK );
  6499. return pCur->skipNext;
  6500. }
  6501. assert( cursorHoldsMutex(pCur) );
  6502. assert( pCur->wrFlag && pBt->inTransaction==TRANS_WRITE
  6503. && (pBt->btsFlags & BTS_READ_ONLY)==0 );
  6504. assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  6505. /* Assert that the caller has been consistent. If this cursor was opened
  6506. ** expecting an index b-tree, then the caller should be inserting blob
  6507. ** keys with no associated data. If the cursor was opened expecting an
  6508. ** intkey table, the caller should be inserting integer keys with a
  6509. ** blob of associated data. */
  6510. assert( (pKey==0)==(pCur->pKeyInfo==0) );
  6511. /* Save the positions of any other cursors open on this table.
  6512. **
  6513. ** In some cases, the call to btreeMoveto() below is a no-op. For
  6514. ** example, when inserting data into a table with auto-generated integer
  6515. ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
  6516. ** integer key to use. It then calls this function to actually insert the
  6517. ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
  6518. ** that the cursor is already where it needs to be and returns without
  6519. ** doing any work. To avoid thwarting these optimizations, it is important
  6520. ** not to clear the cursor here.
  6521. */
  6522. rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  6523. if( rc ) return rc;
  6524. /* If this is an insert into a table b-tree, invalidate any incrblob
  6525. ** cursors open on the row being replaced (assuming this is a replace
  6526. ** operation - if it is not, the following is a no-op). */
  6527. if( pCur->pKeyInfo==0 ){
  6528. invalidateIncrblobCursors(p, nKey, 0);
  6529. }
  6530. if( !loc ){
  6531. rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc);
  6532. if( rc ) return rc;
  6533. }
  6534. assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
  6535. pPage = pCur->apPage[pCur->iPage];
  6536. assert( pPage->intKey || nKey>=0 );
  6537. assert( pPage->leaf || !pPage->intKey );
  6538. TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
  6539. pCur->pgnoRoot, nKey, nData, pPage->pgno,
  6540. loc==0 ? "overwrite" : "new entry"));
  6541. assert( pPage->isInit );
  6542. allocateTempSpace(pBt);
  6543. newCell = pBt->pTmpSpace;
  6544. if( newCell==0 ) return SQLITE_NOMEM;
  6545. rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  6546. if( rc ) goto end_insert;
  6547. assert( szNew==cellSizePtr(pPage, newCell) );
  6548. assert( szNew <= MX_CELL_SIZE(pBt) );
  6549. idx = pCur->aiIdx[pCur->iPage];
  6550. if( loc==0 ){
  6551. u16 szOld;
  6552. assert( idx<pPage->nCell );
  6553. rc = sqlite3PagerWrite(pPage->pDbPage);
  6554. if( rc ){
  6555. goto end_insert;
  6556. }
  6557. oldCell = findCell(pPage, idx);
  6558. if( !pPage->leaf ){
  6559. memcpy(newCell, oldCell, 4);
  6560. }
  6561. szOld = cellSizePtr(pPage, oldCell);
  6562. rc = clearCell(pPage, oldCell);
  6563. dropCell(pPage, idx, szOld, &rc);
  6564. if( rc ) goto end_insert;
  6565. }else if( loc<0 && pPage->nCell>0 ){
  6566. assert( pPage->leaf );
  6567. idx = ++pCur->aiIdx[pCur->iPage];
  6568. }else{
  6569. assert( pPage->leaf );
  6570. }
  6571. insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
  6572. assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
  6573. /* If no error has occurred and pPage has an overflow cell, call balance()
  6574. ** to redistribute the cells within the tree. Since balance() may move
  6575. ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey
  6576. ** variables.
  6577. **
  6578. ** Previous versions of SQLite called moveToRoot() to move the cursor
  6579. ** back to the root page as balance() used to invalidate the contents
  6580. ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
  6581. ** set the cursor state to "invalid". This makes common insert operations
  6582. ** slightly faster.
  6583. **
  6584. ** There is a subtle but important optimization here too. When inserting
  6585. ** multiple records into an intkey b-tree using a single cursor (as can
  6586. ** happen while processing an "INSERT INTO ... SELECT" statement), it
  6587. ** is advantageous to leave the cursor pointing to the last entry in
  6588. ** the b-tree if possible. If the cursor is left pointing to the last
  6589. ** entry in the table, and the next row inserted has an integer key
  6590. ** larger than the largest existing key, it is possible to insert the
  6591. ** row without seeking the cursor. This can be a big performance boost.
  6592. */
  6593. pCur->info.nSize = 0;
  6594. pCur->validNKey = 0;
  6595. if( rc==SQLITE_OK && pPage->nOverflow ){
  6596. rc = balance(pCur);
  6597. /* Must make sure nOverflow is reset to zero even if the balance()
  6598. ** fails. Internal data structure corruption will result otherwise.
  6599. ** Also, set the cursor state to invalid. This stops saveCursorPosition()
  6600. ** from trying to save the current position of the cursor. */
  6601. pCur->apPage[pCur->iPage]->nOverflow = 0;
  6602. pCur->eState = CURSOR_INVALID;
  6603. }
  6604. assert( pCur->apPage[pCur->iPage]->nOverflow==0 );
  6605. end_insert:
  6606. return rc;
  6607. }
  6608. /*
  6609. ** Delete the entry that the cursor is pointing to. The cursor
  6610. ** is left pointing at a arbitrary location.
  6611. */
  6612. int sqlite3BtreeDelete(BtCursor *pCur){
  6613. Btree *p = pCur->pBtree;
  6614. BtShared *pBt = p->pBt;
  6615. int rc; /* Return code */
  6616. MemPage *pPage; /* Page to delete cell from */
  6617. unsigned char *pCell; /* Pointer to cell to delete */
  6618. int iCellIdx; /* Index of cell to delete */
  6619. int iCellDepth; /* Depth of node containing pCell */
  6620. assert( cursorHoldsMutex(pCur) );
  6621. assert( pBt->inTransaction==TRANS_WRITE );
  6622. assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  6623. assert( pCur->wrFlag );
  6624. assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  6625. assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  6626. if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell)
  6627. || NEVER(pCur->eState!=CURSOR_VALID)
  6628. ){
  6629. return SQLITE_ERROR; /* Something has gone awry. */
  6630. }
  6631. iCellDepth = pCur->iPage;
  6632. iCellIdx = pCur->aiIdx[iCellDepth];
  6633. pPage = pCur->apPage[iCellDepth];
  6634. pCell = findCell(pPage, iCellIdx);
  6635. /* If the page containing the entry to delete is not a leaf page, move
  6636. ** the cursor to the largest entry in the tree that is smaller than
  6637. ** the entry being deleted. This cell will replace the cell being deleted
  6638. ** from the internal node. The 'previous' entry is used for this instead
  6639. ** of the 'next' entry, as the previous entry is always a part of the
  6640. ** sub-tree headed by the child page of the cell being deleted. This makes
  6641. ** balancing the tree following the delete operation easier. */
  6642. if( !pPage->leaf ){
  6643. int notUsed;
  6644. rc = sqlite3BtreePrevious(pCur, &notUsed);
  6645. if( rc ) return rc;
  6646. }
  6647. /* Save the positions of any other cursors open on this table before
  6648. ** making any modifications. Make the page containing the entry to be
  6649. ** deleted writable. Then free any overflow pages associated with the
  6650. ** entry and finally remove the cell itself from within the page.
  6651. */
  6652. rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
  6653. if( rc ) return rc;
  6654. /* If this is a delete operation to remove a row from a table b-tree,
  6655. ** invalidate any incrblob cursors open on the row being deleted. */
  6656. if( pCur->pKeyInfo==0 ){
  6657. invalidateIncrblobCursors(p, pCur->info.nKey, 0);
  6658. }
  6659. rc = sqlite3PagerWrite(pPage->pDbPage);
  6660. if( rc ) return rc;
  6661. rc = clearCell(pPage, pCell);
  6662. dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell), &rc);
  6663. if( rc ) return rc;
  6664. /* If the cell deleted was not located on a leaf page, then the cursor
  6665. ** is currently pointing to the largest entry in the sub-tree headed
  6666. ** by the child-page of the cell that was just deleted from an internal
  6667. ** node. The cell from the leaf node needs to be moved to the internal
  6668. ** node to replace the deleted cell. */
  6669. if( !pPage->leaf ){
  6670. MemPage *pLeaf = pCur->apPage[pCur->iPage];
  6671. int nCell;
  6672. Pgno n = pCur->apPage[iCellDepth+1]->pgno;
  6673. unsigned char *pTmp;
  6674. pCell = findCell(pLeaf, pLeaf->nCell-1);
  6675. nCell = cellSizePtr(pLeaf, pCell);
  6676. assert( MX_CELL_SIZE(pBt) >= nCell );
  6677. allocateTempSpace(pBt);
  6678. pTmp = pBt->pTmpSpace;
  6679. rc = sqlite3PagerWrite(pLeaf->pDbPage);
  6680. insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
  6681. dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
  6682. if( rc ) return rc;
  6683. }
  6684. /* Balance the tree. If the entry deleted was located on a leaf page,
  6685. ** then the cursor still points to that page. In this case the first
  6686. ** call to balance() repairs the tree, and the if(...) condition is
  6687. ** never true.
  6688. **
  6689. ** Otherwise, if the entry deleted was on an internal node page, then
  6690. ** pCur is pointing to the leaf page from which a cell was removed to
  6691. ** replace the cell deleted from the internal node. This is slightly
  6692. ** tricky as the leaf node may be underfull, and the internal node may
  6693. ** be either under or overfull. In this case run the balancing algorithm
  6694. ** on the leaf node first. If the balance proceeds far enough up the
  6695. ** tree that we can be sure that any problem in the internal node has
  6696. ** been corrected, so be it. Otherwise, after balancing the leaf node,
  6697. ** walk the cursor up the tree to the internal node and balance it as
  6698. ** well. */
  6699. rc = balance(pCur);
  6700. if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
  6701. while( pCur->iPage>iCellDepth ){
  6702. releasePage(pCur->apPage[pCur->iPage--]);
  6703. }
  6704. rc = balance(pCur);
  6705. }
  6706. if( rc==SQLITE_OK ){
  6707. moveToRoot(pCur);
  6708. }
  6709. return rc;
  6710. }
  6711. /*
  6712. ** Create a new BTree table. Write into *piTable the page
  6713. ** number for the root page of the new table.
  6714. **
  6715. ** The type of type is determined by the flags parameter. Only the
  6716. ** following values of flags are currently in use. Other values for
  6717. ** flags might not work:
  6718. **
  6719. ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
  6720. ** BTREE_ZERODATA Used for SQL indices
  6721. */
  6722. static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
  6723. BtShared *pBt = p->pBt;
  6724. MemPage *pRoot;
  6725. Pgno pgnoRoot;
  6726. int rc;
  6727. int ptfFlags; /* Page-type flage for the root page of new table */
  6728. assert( sqlite3BtreeHoldsMutex(p) );
  6729. assert( pBt->inTransaction==TRANS_WRITE );
  6730. assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  6731. #ifdef SQLITE_OMIT_AUTOVACUUM
  6732. rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6733. if( rc ){
  6734. return rc;
  6735. }
  6736. #else
  6737. if( pBt->autoVacuum ){
  6738. Pgno pgnoMove; /* Move a page here to make room for the root-page */
  6739. MemPage *pPageMove; /* The page to move to. */
  6740. /* Creating a new table may probably require moving an existing database
  6741. ** to make room for the new tables root page. In case this page turns
  6742. ** out to be an overflow page, delete all overflow page-map caches
  6743. ** held by open cursors.
  6744. */
  6745. invalidateAllOverflowCache(pBt);
  6746. /* Read the value of meta[3] from the database to determine where the
  6747. ** root page of the new table should go. meta[3] is the largest root-page
  6748. ** created so far, so the new root-page is (meta[3]+1).
  6749. */
  6750. sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
  6751. pgnoRoot++;
  6752. /* The new root-page may not be allocated on a pointer-map page, or the
  6753. ** PENDING_BYTE page.
  6754. */
  6755. while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
  6756. pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
  6757. pgnoRoot++;
  6758. }
  6759. assert( pgnoRoot>=3 );
  6760. /* Allocate a page. The page that currently resides at pgnoRoot will
  6761. ** be moved to the allocated page (unless the allocated page happens
  6762. ** to reside at pgnoRoot).
  6763. */
  6764. rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
  6765. if( rc!=SQLITE_OK ){
  6766. return rc;
  6767. }
  6768. if( pgnoMove!=pgnoRoot ){
  6769. /* pgnoRoot is the page that will be used for the root-page of
  6770. ** the new table (assuming an error did not occur). But we were
  6771. ** allocated pgnoMove. If required (i.e. if it was not allocated
  6772. ** by extending the file), the current page at position pgnoMove
  6773. ** is already journaled.
  6774. */
  6775. u8 eType = 0;
  6776. Pgno iPtrPage = 0;
  6777. /* Save the positions of any open cursors. This is required in
  6778. ** case they are holding a reference to an xFetch reference
  6779. ** corresponding to page pgnoRoot. */
  6780. rc = saveAllCursors(pBt, 0, 0);
  6781. releasePage(pPageMove);
  6782. if( rc!=SQLITE_OK ){
  6783. return rc;
  6784. }
  6785. /* Move the page currently at pgnoRoot to pgnoMove. */
  6786. rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6787. if( rc!=SQLITE_OK ){
  6788. return rc;
  6789. }
  6790. rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
  6791. if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
  6792. rc = SQLITE_CORRUPT_BKPT;
  6793. }
  6794. if( rc!=SQLITE_OK ){
  6795. releasePage(pRoot);
  6796. return rc;
  6797. }
  6798. assert( eType!=PTRMAP_ROOTPAGE );
  6799. assert( eType!=PTRMAP_FREEPAGE );
  6800. rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
  6801. releasePage(pRoot);
  6802. /* Obtain the page at pgnoRoot */
  6803. if( rc!=SQLITE_OK ){
  6804. return rc;
  6805. }
  6806. rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6807. if( rc!=SQLITE_OK ){
  6808. return rc;
  6809. }
  6810. rc = sqlite3PagerWrite(pRoot->pDbPage);
  6811. if( rc!=SQLITE_OK ){
  6812. releasePage(pRoot);
  6813. return rc;
  6814. }
  6815. }else{
  6816. pRoot = pPageMove;
  6817. }
  6818. /* Update the pointer-map and meta-data with the new root-page number. */
  6819. ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
  6820. if( rc ){
  6821. releasePage(pRoot);
  6822. return rc;
  6823. }
  6824. /* When the new root page was allocated, page 1 was made writable in
  6825. ** order either to increase the database filesize, or to decrement the
  6826. ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
  6827. */
  6828. assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
  6829. rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
  6830. if( NEVER(rc) ){
  6831. releasePage(pRoot);
  6832. return rc;
  6833. }
  6834. }else{
  6835. rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6836. if( rc ) return rc;
  6837. }
  6838. #endif
  6839. assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  6840. if( createTabFlags & BTREE_INTKEY ){
  6841. ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
  6842. }else{
  6843. ptfFlags = PTF_ZERODATA | PTF_LEAF;
  6844. }
  6845. zeroPage(pRoot, ptfFlags);
  6846. sqlite3PagerUnref(pRoot->pDbPage);
  6847. assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
  6848. *piTable = (int)pgnoRoot;
  6849. return SQLITE_OK;
  6850. }
  6851. int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
  6852. int rc;
  6853. sqlite3BtreeEnter(p);
  6854. rc = btreeCreateTable(p, piTable, flags);
  6855. sqlite3BtreeLeave(p);
  6856. return rc;
  6857. }
  6858. /*
  6859. ** Erase the given database page and all its children. Return
  6860. ** the page to the freelist.
  6861. */
  6862. static int clearDatabasePage(
  6863. BtShared *pBt, /* The BTree that contains the table */
  6864. Pgno pgno, /* Page number to clear */
  6865. int freePageFlag, /* Deallocate page if true */
  6866. int *pnChange /* Add number of Cells freed to this counter */
  6867. ){
  6868. MemPage *pPage;
  6869. int rc;
  6870. unsigned char *pCell;
  6871. int i;
  6872. assert( sqlite3_mutex_held(pBt->mutex) );
  6873. if( pgno>btreePagecount(pBt) ){
  6874. return SQLITE_CORRUPT_BKPT;
  6875. }
  6876. rc = getAndInitPage(pBt, pgno, &pPage, 0);
  6877. if( rc ) return rc;
  6878. for(i=0; i<pPage->nCell; i++){
  6879. pCell = findCell(pPage, i);
  6880. if( !pPage->leaf ){
  6881. rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
  6882. if( rc ) goto cleardatabasepage_out;
  6883. }
  6884. rc = clearCell(pPage, pCell);
  6885. if( rc ) goto cleardatabasepage_out;
  6886. }
  6887. if( !pPage->leaf ){
  6888. rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), 1, pnChange);
  6889. if( rc ) goto cleardatabasepage_out;
  6890. }else if( pnChange ){
  6891. assert( pPage->intKey );
  6892. *pnChange += pPage->nCell;
  6893. }
  6894. if( freePageFlag ){
  6895. freePage(pPage, &rc);
  6896. }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
  6897. zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
  6898. }
  6899. cleardatabasepage_out:
  6900. releasePage(pPage);
  6901. return rc;
  6902. }
  6903. /*
  6904. ** Delete all information from a single table in the database. iTable is
  6905. ** the page number of the root of the table. After this routine returns,
  6906. ** the root page is empty, but still exists.
  6907. **
  6908. ** This routine will fail with SQLITE_LOCKED if there are any open
  6909. ** read cursors on the table. Open write cursors are moved to the
  6910. ** root of the table.
  6911. **
  6912. ** If pnChange is not NULL, then table iTable must be an intkey table. The
  6913. ** integer value pointed to by pnChange is incremented by the number of
  6914. ** entries in the table.
  6915. */
  6916. int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
  6917. int rc;
  6918. BtShared *pBt = p->pBt;
  6919. sqlite3BtreeEnter(p);
  6920. assert( p->inTrans==TRANS_WRITE );
  6921. rc = saveAllCursors(pBt, (Pgno)iTable, 0);
  6922. if( SQLITE_OK==rc ){
  6923. /* Invalidate all incrblob cursors open on table iTable (assuming iTable
  6924. ** is the root of a table b-tree - if it is not, the following call is
  6925. ** a no-op). */
  6926. invalidateIncrblobCursors(p, 0, 1);
  6927. rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
  6928. }
  6929. sqlite3BtreeLeave(p);
  6930. return rc;
  6931. }
  6932. /*
  6933. ** Erase all information in a table and add the root of the table to
  6934. ** the freelist. Except, the root of the principal table (the one on
  6935. ** page 1) is never added to the freelist.
  6936. **
  6937. ** This routine will fail with SQLITE_LOCKED if there are any open
  6938. ** cursors on the table.
  6939. **
  6940. ** If AUTOVACUUM is enabled and the page at iTable is not the last
  6941. ** root page in the database file, then the last root page
  6942. ** in the database file is moved into the slot formerly occupied by
  6943. ** iTable and that last slot formerly occupied by the last root page
  6944. ** is added to the freelist instead of iTable. In this way, all
  6945. ** root pages are kept at the beginning of the database file, which
  6946. ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
  6947. ** page number that used to be the last root page in the file before
  6948. ** the move. If no page gets moved, *piMoved is set to 0.
  6949. ** The last root page is recorded in meta[3] and the value of
  6950. ** meta[3] is updated by this procedure.
  6951. */
  6952. static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
  6953. int rc;
  6954. MemPage *pPage = 0;
  6955. BtShared *pBt = p->pBt;
  6956. assert( sqlite3BtreeHoldsMutex(p) );
  6957. assert( p->inTrans==TRANS_WRITE );
  6958. /* It is illegal to drop a table if any cursors are open on the
  6959. ** database. This is because in auto-vacuum mode the backend may
  6960. ** need to move another root-page to fill a gap left by the deleted
  6961. ** root page. If an open cursor was using this page a problem would
  6962. ** occur.
  6963. **
  6964. ** This error is caught long before control reaches this point.
  6965. */
  6966. if( NEVER(pBt->pCursor) ){
  6967. sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db);
  6968. return SQLITE_LOCKED_SHAREDCACHE;
  6969. }
  6970. rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  6971. if( rc ) return rc;
  6972. rc = sqlite3BtreeClearTable(p, iTable, 0);
  6973. if( rc ){
  6974. releasePage(pPage);
  6975. return rc;
  6976. }
  6977. *piMoved = 0;
  6978. if( iTable>1 ){
  6979. #ifdef SQLITE_OMIT_AUTOVACUUM
  6980. freePage(pPage, &rc);
  6981. releasePage(pPage);
  6982. #else
  6983. if( pBt->autoVacuum ){
  6984. Pgno maxRootPgno;
  6985. sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
  6986. if( iTable==maxRootPgno ){
  6987. /* If the table being dropped is the table with the largest root-page
  6988. ** number in the database, put the root page on the free list.
  6989. */
  6990. freePage(pPage, &rc);
  6991. releasePage(pPage);
  6992. if( rc!=SQLITE_OK ){
  6993. return rc;
  6994. }
  6995. }else{
  6996. /* The table being dropped does not have the largest root-page
  6997. ** number in the database. So move the page that does into the
  6998. ** gap left by the deleted root-page.
  6999. */
  7000. MemPage *pMove;
  7001. releasePage(pPage);
  7002. rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
  7003. if( rc!=SQLITE_OK ){
  7004. return rc;
  7005. }
  7006. rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
  7007. releasePage(pMove);
  7008. if( rc!=SQLITE_OK ){
  7009. return rc;
  7010. }
  7011. pMove = 0;
  7012. rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
  7013. freePage(pMove, &rc);
  7014. releasePage(pMove);
  7015. if( rc!=SQLITE_OK ){
  7016. return rc;
  7017. }
  7018. *piMoved = maxRootPgno;
  7019. }
  7020. /* Set the new 'max-root-page' value in the database header. This
  7021. ** is the old value less one, less one more if that happens to
  7022. ** be a root-page number, less one again if that is the
  7023. ** PENDING_BYTE_PAGE.
  7024. */
  7025. maxRootPgno--;
  7026. while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
  7027. || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
  7028. maxRootPgno--;
  7029. }
  7030. assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
  7031. rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  7032. }else{
  7033. freePage(pPage, &rc);
  7034. releasePage(pPage);
  7035. }
  7036. #endif
  7037. }else{
  7038. /* If sqlite3BtreeDropTable was called on page 1.
  7039. ** This really never should happen except in a corrupt
  7040. ** database.
  7041. */
  7042. zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
  7043. releasePage(pPage);
  7044. }
  7045. return rc;
  7046. }
  7047. int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  7048. int rc;
  7049. sqlite3BtreeEnter(p);
  7050. rc = btreeDropTable(p, iTable, piMoved);
  7051. sqlite3BtreeLeave(p);
  7052. return rc;
  7053. }
  7054. /*
  7055. ** This function may only be called if the b-tree connection already
  7056. ** has a read or write transaction open on the database.
  7057. **
  7058. ** Read the meta-information out of a database file. Meta[0]
  7059. ** is the number of free pages currently in the database. Meta[1]
  7060. ** through meta[15] are available for use by higher layers. Meta[0]
  7061. ** is read-only, the others are read/write.
  7062. **
  7063. ** The schema layer numbers meta values differently. At the schema
  7064. ** layer (and the SetCookie and ReadCookie opcodes) the number of
  7065. ** free pages is not visible. So Cookie[0] is the same as Meta[1].
  7066. */
  7067. void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
  7068. BtShared *pBt = p->pBt;
  7069. sqlite3BtreeEnter(p);
  7070. assert( p->inTrans>TRANS_NONE );
  7071. assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
  7072. assert( pBt->pPage1 );
  7073. assert( idx>=0 && idx<=15 );
  7074. *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
  7075. /* If auto-vacuum is disabled in this build and this is an auto-vacuum
  7076. ** database, mark the database as read-only. */
  7077. #ifdef SQLITE_OMIT_AUTOVACUUM
  7078. if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
  7079. pBt->btsFlags |= BTS_READ_ONLY;
  7080. }
  7081. #endif
  7082. sqlite3BtreeLeave(p);
  7083. }
  7084. /*
  7085. ** Write meta-information back into the database. Meta[0] is
  7086. ** read-only and may not be written.
  7087. */
  7088. int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
  7089. BtShared *pBt = p->pBt;
  7090. unsigned char *pP1;
  7091. int rc;
  7092. assert( idx>=1 && idx<=15 );
  7093. sqlite3BtreeEnter(p);
  7094. assert( p->inTrans==TRANS_WRITE );
  7095. assert( pBt->pPage1!=0 );
  7096. pP1 = pBt->pPage1->aData;
  7097. rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  7098. if( rc==SQLITE_OK ){
  7099. put4byte(&pP1[36 + idx*4], iMeta);
  7100. #ifndef SQLITE_OMIT_AUTOVACUUM
  7101. if( idx==BTREE_INCR_VACUUM ){
  7102. assert( pBt->autoVacuum || iMeta==0 );
  7103. assert( iMeta==0 || iMeta==1 );
  7104. pBt->incrVacuum = (u8)iMeta;
  7105. }
  7106. #endif
  7107. }
  7108. sqlite3BtreeLeave(p);
  7109. return rc;
  7110. }
  7111. #ifndef SQLITE_OMIT_BTREECOUNT
  7112. /*
  7113. ** The first argument, pCur, is a cursor opened on some b-tree. Count the
  7114. ** number of entries in the b-tree and write the result to *pnEntry.
  7115. **
  7116. ** SQLITE_OK is returned if the operation is successfully executed.
  7117. ** Otherwise, if an error is encountered (i.e. an IO error or database
  7118. ** corruption) an SQLite error code is returned.
  7119. */
  7120. int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
  7121. i64 nEntry = 0; /* Value to return in *pnEntry */
  7122. int rc; /* Return code */
  7123. if( pCur->pgnoRoot==0 ){
  7124. *pnEntry = 0;
  7125. return SQLITE_OK;
  7126. }
  7127. rc = moveToRoot(pCur);
  7128. /* Unless an error occurs, the following loop runs one iteration for each
  7129. ** page in the B-Tree structure (not including overflow pages).
  7130. */
  7131. while( rc==SQLITE_OK ){
  7132. int iIdx; /* Index of child node in parent */
  7133. MemPage *pPage; /* Current page of the b-tree */
  7134. /* If this is a leaf page or the tree is not an int-key tree, then
  7135. ** this page contains countable entries. Increment the entry counter
  7136. ** accordingly.
  7137. */
  7138. pPage = pCur->apPage[pCur->iPage];
  7139. if( pPage->leaf || !pPage->intKey ){
  7140. nEntry += pPage->nCell;
  7141. }
  7142. /* pPage is a leaf node. This loop navigates the cursor so that it
  7143. ** points to the first interior cell that it points to the parent of
  7144. ** the next page in the tree that has not yet been visited. The
  7145. ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
  7146. ** of the page, or to the number of cells in the page if the next page
  7147. ** to visit is the right-child of its parent.
  7148. **
  7149. ** If all pages in the tree have been visited, return SQLITE_OK to the
  7150. ** caller.
  7151. */
  7152. if( pPage->leaf ){
  7153. do {
  7154. if( pCur->iPage==0 ){
  7155. /* All pages of the b-tree have been visited. Return successfully. */
  7156. *pnEntry = nEntry;
  7157. return SQLITE_OK;
  7158. }
  7159. moveToParent(pCur);
  7160. }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell );
  7161. pCur->aiIdx[pCur->iPage]++;
  7162. pPage = pCur->apPage[pCur->iPage];
  7163. }
  7164. /* Descend to the child node of the cell that the cursor currently
  7165. ** points at. This is the right-child if (iIdx==pPage->nCell).
  7166. */
  7167. iIdx = pCur->aiIdx[pCur->iPage];
  7168. if( iIdx==pPage->nCell ){
  7169. rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  7170. }else{
  7171. rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
  7172. }
  7173. }
  7174. /* An error has occurred. Return an error code. */
  7175. return rc;
  7176. }
  7177. #endif
  7178. /*
  7179. ** Return the pager associated with a BTree. This routine is used for
  7180. ** testing and debugging only.
  7181. */
  7182. Pager *sqlite3BtreePager(Btree *p){
  7183. return p->pBt->pPager;
  7184. }
  7185. #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  7186. /*
  7187. ** Append a message to the error message string.
  7188. */
  7189. static void checkAppendMsg(
  7190. IntegrityCk *pCheck,
  7191. char *zMsg1,
  7192. const char *zFormat,
  7193. ...
  7194. ){
  7195. va_list ap;
  7196. if( !pCheck->mxErr ) return;
  7197. pCheck->mxErr--;
  7198. pCheck->nErr++;
  7199. va_start(ap, zFormat);
  7200. if( pCheck->errMsg.nChar ){
  7201. sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
  7202. }
  7203. if( zMsg1 ){
  7204. sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
  7205. }
  7206. sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
  7207. va_end(ap);
  7208. if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
  7209. pCheck->mallocFailed = 1;
  7210. }
  7211. }
  7212. #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  7213. #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  7214. /*
  7215. ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
  7216. ** corresponds to page iPg is already set.
  7217. */
  7218. static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
  7219. assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
  7220. return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
  7221. }
  7222. /*
  7223. ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
  7224. */
  7225. static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
  7226. assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
  7227. pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
  7228. }
  7229. /*
  7230. ** Add 1 to the reference count for page iPage. If this is the second
  7231. ** reference to the page, add an error message to pCheck->zErrMsg.
  7232. ** Return 1 if there are 2 ore more references to the page and 0 if
  7233. ** if this is the first reference to the page.
  7234. **
  7235. ** Also check that the page number is in bounds.
  7236. */
  7237. static int checkRef(IntegrityCk *pCheck, Pgno iPage, char *zContext){
  7238. if( iPage==0 ) return 1;
  7239. if( iPage>pCheck->nPage ){
  7240. checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
  7241. return 1;
  7242. }
  7243. if( getPageReferenced(pCheck, iPage) ){
  7244. checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
  7245. return 1;
  7246. }
  7247. setPageReferenced(pCheck, iPage);
  7248. return 0;
  7249. }
  7250. #ifndef SQLITE_OMIT_AUTOVACUUM
  7251. /*
  7252. ** Check that the entry in the pointer-map for page iChild maps to
  7253. ** page iParent, pointer type ptrType. If not, append an error message
  7254. ** to pCheck.
  7255. */
  7256. static void checkPtrmap(
  7257. IntegrityCk *pCheck, /* Integrity check context */
  7258. Pgno iChild, /* Child page number */
  7259. u8 eType, /* Expected pointer map type */
  7260. Pgno iParent, /* Expected pointer map parent page number */
  7261. char *zContext /* Context description (used for error msg) */
  7262. ){
  7263. int rc;
  7264. u8 ePtrmapType;
  7265. Pgno iPtrmapParent;
  7266. rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
  7267. if( rc!=SQLITE_OK ){
  7268. if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
  7269. checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
  7270. return;
  7271. }
  7272. if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
  7273. checkAppendMsg(pCheck, zContext,
  7274. "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
  7275. iChild, eType, iParent, ePtrmapType, iPtrmapParent);
  7276. }
  7277. }
  7278. #endif
  7279. /*
  7280. ** Check the integrity of the freelist or of an overflow page list.
  7281. ** Verify that the number of pages on the list is N.
  7282. */
  7283. static void checkList(
  7284. IntegrityCk *pCheck, /* Integrity checking context */
  7285. int isFreeList, /* True for a freelist. False for overflow page list */
  7286. int iPage, /* Page number for first page in the list */
  7287. int N, /* Expected number of pages in the list */
  7288. char *zContext /* Context for error messages */
  7289. ){
  7290. int i;
  7291. int expected = N;
  7292. int iFirst = iPage;
  7293. while( N-- > 0 && pCheck->mxErr ){
  7294. DbPage *pOvflPage;
  7295. unsigned char *pOvflData;
  7296. if( iPage<1 ){
  7297. checkAppendMsg(pCheck, zContext,
  7298. "%d of %d pages missing from overflow list starting at %d",
  7299. N+1, expected, iFirst);
  7300. break;
  7301. }
  7302. if( checkRef(pCheck, iPage, zContext) ) break;
  7303. if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
  7304. checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
  7305. break;
  7306. }
  7307. pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
  7308. if( isFreeList ){
  7309. int n = get4byte(&pOvflData[4]);
  7310. #ifndef SQLITE_OMIT_AUTOVACUUM
  7311. if( pCheck->pBt->autoVacuum ){
  7312. checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
  7313. }
  7314. #endif
  7315. if( n>(int)pCheck->pBt->usableSize/4-2 ){
  7316. checkAppendMsg(pCheck, zContext,
  7317. "freelist leaf count too big on page %d", iPage);
  7318. N--;
  7319. }else{
  7320. for(i=0; i<n; i++){
  7321. Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
  7322. #ifndef SQLITE_OMIT_AUTOVACUUM
  7323. if( pCheck->pBt->autoVacuum ){
  7324. checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
  7325. }
  7326. #endif
  7327. checkRef(pCheck, iFreePage, zContext);
  7328. }
  7329. N -= n;
  7330. }
  7331. }
  7332. #ifndef SQLITE_OMIT_AUTOVACUUM
  7333. else{
  7334. /* If this database supports auto-vacuum and iPage is not the last
  7335. ** page in this overflow list, check that the pointer-map entry for
  7336. ** the following page matches iPage.
  7337. */
  7338. if( pCheck->pBt->autoVacuum && N>0 ){
  7339. i = get4byte(pOvflData);
  7340. checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
  7341. }
  7342. }
  7343. #endif
  7344. iPage = get4byte(pOvflData);
  7345. sqlite3PagerUnref(pOvflPage);
  7346. }
  7347. }
  7348. #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  7349. #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  7350. /*
  7351. ** Do various sanity checks on a single page of a tree. Return
  7352. ** the tree depth. Root pages return 0. Parents of root pages
  7353. ** return 1, and so forth.
  7354. **
  7355. ** These checks are done:
  7356. **
  7357. ** 1. Make sure that cells and freeblocks do not overlap
  7358. ** but combine to completely cover the page.
  7359. ** NO 2. Make sure cell keys are in order.
  7360. ** NO 3. Make sure no key is less than or equal to zLowerBound.
  7361. ** NO 4. Make sure no key is greater than or equal to zUpperBound.
  7362. ** 5. Check the integrity of overflow pages.
  7363. ** 6. Recursively call checkTreePage on all children.
  7364. ** 7. Verify that the depth of all children is the same.
  7365. ** 8. Make sure this page is at least 33% full or else it is
  7366. ** the root of the tree.
  7367. */
  7368. static int checkTreePage(
  7369. IntegrityCk *pCheck, /* Context for the sanity check */
  7370. int iPage, /* Page number of the page to check */
  7371. char *zParentContext, /* Parent context */
  7372. i64 *pnParentMinKey,
  7373. i64 *pnParentMaxKey
  7374. ){
  7375. MemPage *pPage;
  7376. int i, rc, depth, d2, pgno, cnt;
  7377. int hdr, cellStart;
  7378. int nCell;
  7379. u8 *data;
  7380. BtShared *pBt;
  7381. int usableSize;
  7382. char zContext[100];
  7383. char *hit = 0;
  7384. i64 nMinKey = 0;
  7385. i64 nMaxKey = 0;
  7386. sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
  7387. /* Check that the page exists
  7388. */
  7389. pBt = pCheck->pBt;
  7390. usableSize = pBt->usableSize;
  7391. if( iPage==0 ) return 0;
  7392. if( checkRef(pCheck, iPage, zParentContext) ) return 0;
  7393. if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
  7394. checkAppendMsg(pCheck, zContext,
  7395. "unable to get the page. error code=%d", rc);
  7396. return 0;
  7397. }
  7398. /* Clear MemPage.isInit to make sure the corruption detection code in
  7399. ** btreeInitPage() is executed. */
  7400. pPage->isInit = 0;
  7401. if( (rc = btreeInitPage(pPage))!=0 ){
  7402. assert( rc==SQLITE_CORRUPT ); /* The only possible error from InitPage */
  7403. checkAppendMsg(pCheck, zContext,
  7404. "btreeInitPage() returns error code %d", rc);
  7405. releasePage(pPage);
  7406. return 0;
  7407. }
  7408. /* Check out all the cells.
  7409. */
  7410. depth = 0;
  7411. for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
  7412. u8 *pCell;
  7413. u32 sz;
  7414. CellInfo info;
  7415. /* Check payload overflow pages
  7416. */
  7417. sqlite3_snprintf(sizeof(zContext), zContext,
  7418. "On tree page %d cell %d: ", iPage, i);
  7419. pCell = findCell(pPage,i);
  7420. btreeParseCellPtr(pPage, pCell, &info);
  7421. sz = info.nData;
  7422. if( !pPage->intKey ) sz += (int)info.nKey;
  7423. /* For intKey pages, check that the keys are in order.
  7424. */
  7425. else if( i==0 ) nMinKey = nMaxKey = info.nKey;
  7426. else{
  7427. if( info.nKey <= nMaxKey ){
  7428. checkAppendMsg(pCheck, zContext,
  7429. "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
  7430. }
  7431. nMaxKey = info.nKey;
  7432. }
  7433. assert( sz==info.nPayload );
  7434. if( (sz>info.nLocal)
  7435. && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
  7436. ){
  7437. int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
  7438. Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
  7439. #ifndef SQLITE_OMIT_AUTOVACUUM
  7440. if( pBt->autoVacuum ){
  7441. checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
  7442. }
  7443. #endif
  7444. checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
  7445. }
  7446. /* Check sanity of left child page.
  7447. */
  7448. if( !pPage->leaf ){
  7449. pgno = get4byte(pCell);
  7450. #ifndef SQLITE_OMIT_AUTOVACUUM
  7451. if( pBt->autoVacuum ){
  7452. checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
  7453. }
  7454. #endif
  7455. d2 = checkTreePage(pCheck, pgno, zContext, &nMinKey, i==0 ? NULL : &nMaxKey);
  7456. if( i>0 && d2!=depth ){
  7457. checkAppendMsg(pCheck, zContext, "Child page depth differs");
  7458. }
  7459. depth = d2;
  7460. }
  7461. }
  7462. if( !pPage->leaf ){
  7463. pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  7464. sqlite3_snprintf(sizeof(zContext), zContext,
  7465. "On page %d at right child: ", iPage);
  7466. #ifndef SQLITE_OMIT_AUTOVACUUM
  7467. if( pBt->autoVacuum ){
  7468. checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
  7469. }
  7470. #endif
  7471. checkTreePage(pCheck, pgno, zContext, NULL, !pPage->nCell ? NULL : &nMaxKey);
  7472. }
  7473. /* For intKey leaf pages, check that the min/max keys are in order
  7474. ** with any left/parent/right pages.
  7475. */
  7476. if( pPage->leaf && pPage->intKey ){
  7477. /* if we are a left child page */
  7478. if( pnParentMinKey ){
  7479. /* if we are the left most child page */
  7480. if( !pnParentMaxKey ){
  7481. if( nMaxKey > *pnParentMinKey ){
  7482. checkAppendMsg(pCheck, zContext,
  7483. "Rowid %lld out of order (max larger than parent min of %lld)",
  7484. nMaxKey, *pnParentMinKey);
  7485. }
  7486. }else{
  7487. if( nMinKey <= *pnParentMinKey ){
  7488. checkAppendMsg(pCheck, zContext,
  7489. "Rowid %lld out of order (min less than parent min of %lld)",
  7490. nMinKey, *pnParentMinKey);
  7491. }
  7492. if( nMaxKey > *pnParentMaxKey ){
  7493. checkAppendMsg(pCheck, zContext,
  7494. "Rowid %lld out of order (max larger than parent max of %lld)",
  7495. nMaxKey, *pnParentMaxKey);
  7496. }
  7497. *pnParentMinKey = nMaxKey;
  7498. }
  7499. /* else if we're a right child page */
  7500. } else if( pnParentMaxKey ){
  7501. if( nMinKey <= *pnParentMaxKey ){
  7502. checkAppendMsg(pCheck, zContext,
  7503. "Rowid %lld out of order (min less than parent max of %lld)",
  7504. nMinKey, *pnParentMaxKey);
  7505. }
  7506. }
  7507. }
  7508. /* Check for complete coverage of the page
  7509. */
  7510. data = pPage->aData;
  7511. hdr = pPage->hdrOffset;
  7512. hit = sqlite3PageMalloc( pBt->pageSize );
  7513. if( hit==0 ){
  7514. pCheck->mallocFailed = 1;
  7515. }else{
  7516. int contentOffset = get2byteNotZero(&data[hdr+5]);
  7517. assert( contentOffset<=usableSize ); /* Enforced by btreeInitPage() */
  7518. memset(hit+contentOffset, 0, usableSize-contentOffset);
  7519. memset(hit, 1, contentOffset);
  7520. nCell = get2byte(&data[hdr+3]);
  7521. cellStart = hdr + 12 - 4*pPage->leaf;
  7522. for(i=0; i<nCell; i++){
  7523. int pc = get2byte(&data[cellStart+i*2]);
  7524. u32 size = 65536;
  7525. int j;
  7526. if( pc<=usableSize-4 ){
  7527. size = cellSizePtr(pPage, &data[pc]);
  7528. }
  7529. if( (int)(pc+size-1)>=usableSize ){
  7530. checkAppendMsg(pCheck, 0,
  7531. "Corruption detected in cell %d on page %d",i,iPage);
  7532. }else{
  7533. for(j=pc+size-1; j>=pc; j--) hit[j]++;
  7534. }
  7535. }
  7536. i = get2byte(&data[hdr+1]);
  7537. while( i>0 ){
  7538. int size, j;
  7539. assert( i<=usableSize-4 ); /* Enforced by btreeInitPage() */
  7540. size = get2byte(&data[i+2]);
  7541. assert( i+size<=usableSize ); /* Enforced by btreeInitPage() */
  7542. for(j=i+size-1; j>=i; j--) hit[j]++;
  7543. j = get2byte(&data[i]);
  7544. assert( j==0 || j>i+size ); /* Enforced by btreeInitPage() */
  7545. assert( j<=usableSize-4 ); /* Enforced by btreeInitPage() */
  7546. i = j;
  7547. }
  7548. for(i=cnt=0; i<usableSize; i++){
  7549. if( hit[i]==0 ){
  7550. cnt++;
  7551. }else if( hit[i]>1 ){
  7552. checkAppendMsg(pCheck, 0,
  7553. "Multiple uses for byte %d of page %d", i, iPage);
  7554. break;
  7555. }
  7556. }
  7557. if( cnt!=data[hdr+7] ){
  7558. checkAppendMsg(pCheck, 0,
  7559. "Fragmentation of %d bytes reported as %d on page %d",
  7560. cnt, data[hdr+7], iPage);
  7561. }
  7562. }
  7563. sqlite3PageFree(hit);
  7564. releasePage(pPage);
  7565. return depth+1;
  7566. }
  7567. #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  7568. #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  7569. /*
  7570. ** This routine does a complete check of the given BTree file. aRoot[] is
  7571. ** an array of pages numbers were each page number is the root page of
  7572. ** a table. nRoot is the number of entries in aRoot.
  7573. **
  7574. ** A read-only or read-write transaction must be opened before calling
  7575. ** this function.
  7576. **
  7577. ** Write the number of error seen in *pnErr. Except for some memory
  7578. ** allocation errors, an error message held in memory obtained from
  7579. ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
  7580. ** returned. If a memory allocation error occurs, NULL is returned.
  7581. */
  7582. char *sqlite3BtreeIntegrityCheck(
  7583. Btree *p, /* The btree to be checked */
  7584. int *aRoot, /* An array of root pages numbers for individual trees */
  7585. int nRoot, /* Number of entries in aRoot[] */
  7586. int mxErr, /* Stop reporting errors after this many */
  7587. int *pnErr /* Write number of errors seen to this variable */
  7588. ){
  7589. Pgno i;
  7590. int nRef;
  7591. IntegrityCk sCheck;
  7592. BtShared *pBt = p->pBt;
  7593. char zErr[100];
  7594. sqlite3BtreeEnter(p);
  7595. assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
  7596. nRef = sqlite3PagerRefcount(pBt->pPager);
  7597. sCheck.pBt = pBt;
  7598. sCheck.pPager = pBt->pPager;
  7599. sCheck.nPage = btreePagecount(sCheck.pBt);
  7600. sCheck.mxErr = mxErr;
  7601. sCheck.nErr = 0;
  7602. sCheck.mallocFailed = 0;
  7603. *pnErr = 0;
  7604. if( sCheck.nPage==0 ){
  7605. sqlite3BtreeLeave(p);
  7606. return 0;
  7607. }
  7608. sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
  7609. if( !sCheck.aPgRef ){
  7610. *pnErr = 1;
  7611. sqlite3BtreeLeave(p);
  7612. return 0;
  7613. }
  7614. i = PENDING_BYTE_PAGE(pBt);
  7615. if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
  7616. sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
  7617. sCheck.errMsg.useMalloc = 2;
  7618. /* Check the integrity of the freelist
  7619. */
  7620. checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
  7621. get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
  7622. /* Check all the tables.
  7623. */
  7624. for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
  7625. if( aRoot[i]==0 ) continue;
  7626. #ifndef SQLITE_OMIT_AUTOVACUUM
  7627. if( pBt->autoVacuum && aRoot[i]>1 ){
  7628. checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
  7629. }
  7630. #endif
  7631. checkTreePage(&sCheck, aRoot[i], "List of tree roots: ", NULL, NULL);
  7632. }
  7633. /* Make sure every page in the file is referenced
  7634. */
  7635. for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
  7636. #ifdef SQLITE_OMIT_AUTOVACUUM
  7637. if( getPageReferenced(&sCheck, i)==0 ){
  7638. checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  7639. }
  7640. #else
  7641. /* If the database supports auto-vacuum, make sure no tables contain
  7642. ** references to pointer-map pages.
  7643. */
  7644. if( getPageReferenced(&sCheck, i)==0 &&
  7645. (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
  7646. checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  7647. }
  7648. if( getPageReferenced(&sCheck, i)!=0 &&
  7649. (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
  7650. checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
  7651. }
  7652. #endif
  7653. }
  7654. /* Make sure this analysis did not leave any unref() pages.
  7655. ** This is an internal consistency check; an integrity check
  7656. ** of the integrity check.
  7657. */
  7658. if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){
  7659. checkAppendMsg(&sCheck, 0,
  7660. "Outstanding page count goes from %d to %d during this analysis",
  7661. nRef, sqlite3PagerRefcount(pBt->pPager)
  7662. );
  7663. }
  7664. /* Clean up and report errors.
  7665. */
  7666. sqlite3BtreeLeave(p);
  7667. sqlite3_free(sCheck.aPgRef);
  7668. if( sCheck.mallocFailed ){
  7669. sqlite3StrAccumReset(&sCheck.errMsg);
  7670. *pnErr = sCheck.nErr+1;
  7671. return 0;
  7672. }
  7673. *pnErr = sCheck.nErr;
  7674. if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  7675. return sqlite3StrAccumFinish(&sCheck.errMsg);
  7676. }
  7677. #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  7678. /*
  7679. ** Return the full pathname of the underlying database file. Return
  7680. ** an empty string if the database is in-memory or a TEMP database.
  7681. **
  7682. ** The pager filename is invariant as long as the pager is
  7683. ** open so it is safe to access without the BtShared mutex.
  7684. */
  7685. const char *sqlite3BtreeGetFilename(Btree *p){
  7686. assert( p->pBt->pPager!=0 );
  7687. return sqlite3PagerFilename(p->pBt->pPager, 1);
  7688. }
  7689. /*
  7690. ** Return the pathname of the journal file for this database. The return
  7691. ** value of this routine is the same regardless of whether the journal file
  7692. ** has been created or not.
  7693. **
  7694. ** The pager journal filename is invariant as long as the pager is
  7695. ** open so it is safe to access without the BtShared mutex.
  7696. */
  7697. const char *sqlite3BtreeGetJournalname(Btree *p){
  7698. assert( p->pBt->pPager!=0 );
  7699. return sqlite3PagerJournalname(p->pBt->pPager);
  7700. }
  7701. /*
  7702. ** Return non-zero if a transaction is active.
  7703. */
  7704. int sqlite3BtreeIsInTrans(Btree *p){
  7705. assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
  7706. return (p && (p->inTrans==TRANS_WRITE));
  7707. }
  7708. #ifndef SQLITE_OMIT_WAL
  7709. /*
  7710. ** Run a checkpoint on the Btree passed as the first argument.
  7711. **
  7712. ** Return SQLITE_LOCKED if this or any other connection has an open
  7713. ** transaction on the shared-cache the argument Btree is connected to.
  7714. **
  7715. ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
  7716. */
  7717. int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
  7718. int rc = SQLITE_OK;
  7719. if( p ){
  7720. BtShared *pBt = p->pBt;
  7721. sqlite3BtreeEnter(p);
  7722. if( pBt->inTransaction!=TRANS_NONE ){
  7723. rc = SQLITE_LOCKED;
  7724. }else{
  7725. rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt);
  7726. }
  7727. sqlite3BtreeLeave(p);
  7728. }
  7729. return rc;
  7730. }
  7731. #endif
  7732. /*
  7733. ** Return non-zero if a read (or write) transaction is active.
  7734. */
  7735. int sqlite3BtreeIsInReadTrans(Btree *p){
  7736. assert( p );
  7737. assert( sqlite3_mutex_held(p->db->mutex) );
  7738. return p->inTrans!=TRANS_NONE;
  7739. }
  7740. int sqlite3BtreeIsInBackup(Btree *p){
  7741. assert( p );
  7742. assert( sqlite3_mutex_held(p->db->mutex) );
  7743. return p->nBackup!=0;
  7744. }
  7745. /*
  7746. ** This function returns a pointer to a blob of memory associated with
  7747. ** a single shared-btree. The memory is used by client code for its own
  7748. ** purposes (for example, to store a high-level schema associated with
  7749. ** the shared-btree). The btree layer manages reference counting issues.
  7750. **
  7751. ** The first time this is called on a shared-btree, nBytes bytes of memory
  7752. ** are allocated, zeroed, and returned to the caller. For each subsequent
  7753. ** call the nBytes parameter is ignored and a pointer to the same blob
  7754. ** of memory returned.
  7755. **
  7756. ** If the nBytes parameter is 0 and the blob of memory has not yet been
  7757. ** allocated, a null pointer is returned. If the blob has already been
  7758. ** allocated, it is returned as normal.
  7759. **
  7760. ** Just before the shared-btree is closed, the function passed as the
  7761. ** xFree argument when the memory allocation was made is invoked on the
  7762. ** blob of allocated memory. The xFree function should not call sqlite3_free()
  7763. ** on the memory, the btree layer does that.
  7764. */
  7765. void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
  7766. BtShared *pBt = p->pBt;
  7767. sqlite3BtreeEnter(p);
  7768. if( !pBt->pSchema && nBytes ){
  7769. pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
  7770. pBt->xFreeSchema = xFree;
  7771. }
  7772. sqlite3BtreeLeave(p);
  7773. return pBt->pSchema;
  7774. }
  7775. /*
  7776. ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
  7777. ** btree as the argument handle holds an exclusive lock on the
  7778. ** sqlite_master table. Otherwise SQLITE_OK.
  7779. */
  7780. int sqlite3BtreeSchemaLocked(Btree *p){
  7781. int rc;
  7782. assert( sqlite3_mutex_held(p->db->mutex) );
  7783. sqlite3BtreeEnter(p);
  7784. rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  7785. assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
  7786. sqlite3BtreeLeave(p);
  7787. return rc;
  7788. }
  7789. #ifndef SQLITE_OMIT_SHARED_CACHE
  7790. /*
  7791. ** Obtain a lock on the table whose root page is iTab. The
  7792. ** lock is a write lock if isWritelock is true or a read lock
  7793. ** if it is false.
  7794. */
  7795. int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
  7796. int rc = SQLITE_OK;
  7797. assert( p->inTrans!=TRANS_NONE );
  7798. if( p->sharable ){
  7799. u8 lockType = READ_LOCK + isWriteLock;
  7800. assert( READ_LOCK+1==WRITE_LOCK );
  7801. assert( isWriteLock==0 || isWriteLock==1 );
  7802. sqlite3BtreeEnter(p);
  7803. rc = querySharedCacheTableLock(p, iTab, lockType);
  7804. if( rc==SQLITE_OK ){
  7805. rc = setSharedCacheTableLock(p, iTab, lockType);
  7806. }
  7807. sqlite3BtreeLeave(p);
  7808. }
  7809. return rc;
  7810. }
  7811. #endif
  7812. #ifndef SQLITE_OMIT_INCRBLOB
  7813. /*
  7814. ** Argument pCsr must be a cursor opened for writing on an
  7815. ** INTKEY table currently pointing at a valid table entry.
  7816. ** This function modifies the data stored as part of that entry.
  7817. **
  7818. ** Only the data content may only be modified, it is not possible to
  7819. ** change the length of the data stored. If this function is called with
  7820. ** parameters that attempt to write past the end of the existing data,
  7821. ** no modifications are made and SQLITE_CORRUPT is returned.
  7822. */
  7823. int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
  7824. int rc;
  7825. assert( cursorHoldsMutex(pCsr) );
  7826. assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
  7827. assert( pCsr->isIncrblobHandle );
  7828. rc = restoreCursorPosition(pCsr);
  7829. if( rc!=SQLITE_OK ){
  7830. return rc;
  7831. }
  7832. assert( pCsr->eState!=CURSOR_REQUIRESEEK );
  7833. if( pCsr->eState!=CURSOR_VALID ){
  7834. return SQLITE_ABORT;
  7835. }
  7836. /* Save the positions of all other cursors open on this table. This is
  7837. ** required in case any of them are holding references to an xFetch
  7838. ** version of the b-tree page modified by the accessPayload call below.
  7839. **
  7840. ** Note that pCsr must be open on a BTREE_INTKEY table and saveCursorPosition()
  7841. ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
  7842. ** saveAllCursors can only return SQLITE_OK.
  7843. */
  7844. VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
  7845. assert( rc==SQLITE_OK );
  7846. /* Check some assumptions:
  7847. ** (a) the cursor is open for writing,
  7848. ** (b) there is a read/write transaction open,
  7849. ** (c) the connection holds a write-lock on the table (if required),
  7850. ** (d) there are no conflicting read-locks, and
  7851. ** (e) the cursor points at a valid row of an intKey table.
  7852. */
  7853. if( !pCsr->wrFlag ){
  7854. return SQLITE_READONLY;
  7855. }
  7856. assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
  7857. && pCsr->pBt->inTransaction==TRANS_WRITE );
  7858. assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
  7859. assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
  7860. assert( pCsr->apPage[pCsr->iPage]->intKey );
  7861. return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
  7862. }
  7863. /*
  7864. ** Set a flag on this cursor to cache the locations of pages from the
  7865. ** overflow list for the current row. This is used by cursors opened
  7866. ** for incremental blob IO only.
  7867. **
  7868. ** This function sets a flag only. The actual page location cache
  7869. ** (stored in BtCursor.aOverflow[]) is allocated and used by function
  7870. ** accessPayload() (the worker function for sqlite3BtreeData() and
  7871. ** sqlite3BtreePutData()).
  7872. */
  7873. void sqlite3BtreeCacheOverflow(BtCursor *pCur){
  7874. assert( cursorHoldsMutex(pCur) );
  7875. assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  7876. invalidateOverflowCache(pCur);
  7877. pCur->isIncrblobHandle = 1;
  7878. }
  7879. #endif
  7880. /*
  7881. ** Set both the "read version" (single byte at byte offset 18) and
  7882. ** "write version" (single byte at byte offset 19) fields in the database
  7883. ** header to iVersion.
  7884. */
  7885. int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
  7886. BtShared *pBt = pBtree->pBt;
  7887. int rc; /* Return code */
  7888. assert( iVersion==1 || iVersion==2 );
  7889. /* If setting the version fields to 1, do not automatically open the
  7890. ** WAL connection, even if the version fields are currently set to 2.
  7891. */
  7892. pBt->btsFlags &= ~BTS_NO_WAL;
  7893. if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
  7894. rc = sqlite3BtreeBeginTrans(pBtree, 0);
  7895. if( rc==SQLITE_OK ){
  7896. u8 *aData = pBt->pPage1->aData;
  7897. if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
  7898. rc = sqlite3BtreeBeginTrans(pBtree, 2);
  7899. if( rc==SQLITE_OK ){
  7900. rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  7901. if( rc==SQLITE_OK ){
  7902. aData[18] = (u8)iVersion;
  7903. aData[19] = (u8)iVersion;
  7904. }
  7905. }
  7906. }
  7907. }
  7908. pBt->btsFlags &= ~BTS_NO_WAL;
  7909. return rc;
  7910. }
  7911. /*
  7912. ** set the mask of hint flags for cursor pCsr. Currently the only valid
  7913. ** values are 0 and BTREE_BULKLOAD.
  7914. */
  7915. void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
  7916. assert( mask==BTREE_BULKLOAD || mask==0 );
  7917. pCsr->hints = mask;
  7918. }